import gradio as gr
import os
import re
import torch
import pandas as pd
import plotly.express as px
import plotly.io as pio
import nltk
import tempfile
from io import BytesIO
import base64
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.tokenize import sent_tokenize
from docx.shared import Inches
from docx import Document
import numpy as np
# Needed for HF GPU access
import spaces
from styles import custom_css # Importing custom CSS
# 'punkt' provides the sentence tokenizer; newer NLTK releases also look for 'punkt_tab'
nltk.download('punkt')
nltk.download('punkt_tab')
# Import PyPDFLoader for PDF processing
from langchain_community.document_loaders import PyPDFLoader
# Model checkpoint for SDG BERT
checkpoint = "sadickam/sdgBERT"
# Text cleaning function
def clean_text(text):
    """
    Cleans the extracted text by removing irrelevant characters but retains currency symbols.
    """
    text = text.strip()
    # Define the allowed characters (including currency symbols)
    allowed_chars = r'[^a-zA-Z0-9\s\.,!?$€£¥₹¢₩]'
    text = re.sub(allowed_chars, '', text)
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text
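# Illustrative example: the '&' below falls outside the allowed character set and
# repeated whitespace is collapsed, while the currency symbol is retained:
#   clean_text("Water &  sanitation cost $5!")  ->  "Water sanitation cost $5!"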
# Preprocessing function for text
def prep_text(text):
    clean_sents = []
    sent_tokens = sent_tokenize(str(text))
    for sent_token in sent_tokens:
        word_tokens = [str(word_token).strip().lower() for word_token in sent_token.split()]
        clean_sents.append(' '.join(word_tokens))
    joined = ' '.join(clean_sents).strip()
    return re.sub(r'`|"', "", joined)
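# Illustrative example: tokens are lower-cased and re-joined, then backticks and
# double quotes are stripped, e.g.
#   prep_text('He said "Go!"')  ->  'he said go!'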
# Define the device once at module level (uses the GPU when available, e.g. on Hugging Face Spaces)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and model, moving the model to the selected device
def load_model_and_tokenizer():
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer
# SDG labels
label_list = [
    'SDG1_No Poverty', 'SDG2_Zero Hunger', 'SDG3_Good Health and Well-being', 'SDG4_Quality Education',
    'SDG5_Gender Equality', 'SDG6_Clean Water and Sanitation', 'SDG7_Affordable and Clean Energy',
    'SDG8_Decent Work and Economic Growth', 'SDG9_Industry, Innovation and Infrastructure',
    'SDG10_Reduced Inequality', 'SDG11_Sustainable Cities and Communities',
    'SDG12_Responsible Consumption and Production', 'SDG13_Climate Action',
    'SDG14_Life Below Water', 'SDG15_Life on Land', 'SDG16_Peace, Justice and Strong Institutions'
]
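# Note: model scores are zipped against this list positionally, so the order above
# is assumed to match the model's id2label mapping in its config.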
# Function to predict SDGs for a batch of text inputs
def predict_sdg_labels_batch(texts, model, tokenizer):
    tokenized_texts = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    model.eval()
    with torch.no_grad():
        text_logits = model(**tokenized_texts).logits
    predictions = torch.softmax(text_logits, dim=1).tolist()
    return predictions
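# Illustrative sketch (downloads the model on first use): each input text yields
# one list of 16 class probabilities, e.g.
#   model, tokenizer = load_model_and_tokenizer()
#   probs = predict_sdg_labels_batch(["Access to clean water for all."], model, tokenizer)
#   # probs[0] is a list of 16 floats summing to ~1.0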
# Page-level predictions with batch processing
def predict_pages(page_df, batch_size=32):
    model, tokenizer = load_model_and_tokenizer()
    df_results = page_df.copy()
    num_rows = len(page_df)
    all_predicted_labels = [[] for _ in range(16)]
    all_prediction_scores = [[] for _ in range(16)]
    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        df_chunk = page_df.iloc[start:end]
        # Clean text
        texts = df_chunk['Text'].apply(clean_text).apply(prep_text).tolist()
        predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
        for predictions in predictions_batch:
            sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
            for i, (label, score) in enumerate(sorted_preds):
                all_predicted_labels[i].append(label)
                all_prediction_scores[i].append(score)
    # Add columns to the DataFrame in the desired order (pred1, score1, pred2, score2, ...)
    for i in range(16):
        df_results[f'pred{i + 1}'] = all_predicted_labels[i]
        df_results[f'score{i + 1}'] = all_prediction_scores[i]
    # Reorder columns so that preds and scores are interleaved in the correct order
    reordered_columns = []
    for i in range(16):
        reordered_columns.append(f'pred{i + 1}')
        reordered_columns.append(f'score{i + 1}')
    other_columns = [col for col in df_results.columns if col not in reordered_columns]
    df_results = df_results[other_columns + reordered_columns]
    return df_results
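# Illustrative result shape: for an input with columns (Document, Page, Text),
# predict_pages returns those columns plus 32 more, interleaved as
#   pred1 | score1 | ... | pred16 | score16
# where pred1/score1 hold each page's highest-probability SDG.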
# Sentence-level predictions with batch processing
def predict_sentences(sentence_df, batch_size=32):
    model, tokenizer = load_model_and_tokenizer()
    df_combined_sentences = sentence_df.copy()
    num_rows = len(sentence_df)
    all_predicted_labels = [[] for _ in range(16)]
    all_prediction_scores = [[] for _ in range(16)]
    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        df_chunk = sentence_df.iloc[start:end]
        # Clean text
        texts = df_chunk['Sentence'].apply(clean_text).apply(prep_text).tolist()
        predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
        for predictions in predictions_batch:
            sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
            for i, (label, score) in enumerate(sorted_preds):
                all_predicted_labels[i].append(label)
                all_prediction_scores[i].append(round(score, 3))
    # Add predictions and scores to the DataFrame
    for i in range(16):
        df_combined_sentences[f'pred{i + 1}'] = all_predicted_labels[i]
        df_combined_sentences[f'score{i + 1}'] = all_prediction_scores[i]
    # Reorder columns so that preds and scores are interleaved
    reordered_columns = []
    for i in range(16):
        reordered_columns.append(f'pred{i + 1}')
        reordered_columns.append(f'score{i + 1}')
    other_columns = [col for col in df_combined_sentences.columns if col not in reordered_columns]
    df_combined_sentences = df_combined_sentences[other_columns + reordered_columns]
    return df_combined_sentences
# Define unique colors for each SDG
sdg_colors = {
    "SDG1_No Poverty": "#E5243B",
    "SDG2_Zero Hunger": "#DDA63A",
    "SDG3_Good Health and Well-being": "#4C9F38",
    "SDG4_Quality Education": "#C5192D",
    "SDG5_Gender Equality": "#FF3A21",
    "SDG6_Clean Water and Sanitation": "#26BDE2",
    "SDG7_Affordable and Clean Energy": "#FCC30B",
    "SDG8_Decent Work and Economic Growth": "#A21942",
    "SDG9_Industry, Innovation and Infrastructure": "#FD6925",
    "SDG10_Reduced Inequality": "#DD1367",
    "SDG11_Sustainable Cities and Communities": "#FD9D24",
    "SDG12_Responsible Consumption and Production": "#BF8B2E",
    "SDG13_Climate Action": "#3F7E44",
    "SDG14_Life Below Water": "#0A97D9",
    "SDG15_Life on Land": "#56C02B",
    "SDG16_Peace, Justice and Strong Institutions": "#00689D"
}
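# These hex codes follow the official UN SDG brand color palette (e.g., #E5243B for SDG 1).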
# Function to plot SDG dominant bar graphs using Plotly
def plot_sdg(df, title, pred_column, x_axis_title=None, y_axis_title=None, icons_folder='assets/icons/'):
    """
    Plots a horizontal bar graph of SDG predictions and superimposes the icon of the most frequent SDG.

    Args:
        df (pd.DataFrame): DataFrame containing SDG predictions.
        title (str): Title of the plot.
        pred_column (str): Column name to use for plotting (e.g., 'pred1').
        x_axis_title (str): Title for the x-axis.
        y_axis_title (str): Title for the y-axis.
        icons_folder (str): Path to the folder containing SDG icons.

    Returns:
        plotly.graph_objs._figure.Figure: The Plotly figure object.
    """
    df_filtered = df[df[pred_column].notna()]
    labels = df_filtered[pred_column].value_counts().sort_values(ascending=False)
    total = labels.sum()
    percentages = (labels / total) * 100
    # Create a horizontal bar plot with Plotly
    fig = px.bar(
        percentages.rename_axis('SDG Label').reset_index(name='Percentage'),
        y='SDG Label',
        x='Percentage',
        orientation='h',
        title=title,
        color='SDG Label',
        color_discrete_map=sdg_colors  # Use the defined unique colors for each SDG
    )
    # Update y-axis to show labels
    fig.update_yaxes(showticklabels=True)
    # Add percentage labels to the bars
    fig.update_traces(
        texttemplate='%{x:.2f}%',
        textposition='auto',
        textfont=dict(size=11)
    )
    # Adjust layout for better visibility
    fig.update_layout(
        title=dict(
            text=title, font=dict(size=14)
        ),
        yaxis=dict(
            automargin=True,
            title=y_axis_title,
            tickfont=dict(size=13)
        ),
        margin=dict(l=20, r=100, t=30, b=20),  # Extra right margin for the icon
        height=600,
        showlegend=False,
        template="simple_white",
        xaxis=dict(
            title=x_axis_title,
            tickfont=dict(size=13)
        ),
    )
    # Identify the most frequent SDG and overlay its icon, if available
    if not percentages.empty:
        top_sdg_label = percentages.index[0]  # e.g., 'SDG1_No Poverty'
        # Map the SDG label to an icon filename, assuming the naming convention 'SDG1.png', 'SDG2.png', etc.
        sdg_number = top_sdg_label.split('_')[0]  # Extract 'SDG1'
        icon_filename = f"{sdg_number}.png"  # e.g., 'SDG1.png'
        icon_path = os.path.join(icons_folder, icon_filename)
        # Check if the icon file exists
        if os.path.exists(icon_path):
            # Read and base64-encode the image for embedding
            with open(icon_path, 'rb') as image_file:
                encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
            # Add the icon as an image in the Plotly figure
            fig.add_layout_image(
                dict(
                    source='data:image/png;base64,' + encoded_image,
                    xref="paper", yref="paper",
                    x=0.98, y=1.0,  # Positioning: top-right of the plot area
                    sizex=0.2, sizey=0.2,  # Size of the icon
                    xanchor="left",
                    yanchor="top",
                    layer="above"  # Ensure the icon is above other plot elements
                )
            )
        else:
            print(f"Icon file '{icon_path}' not found. Skipping icon overlay.")
    return fig
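# Illustrative usage (assuming df_page_predictions comes from predict_pages):
#   fig = plot_sdg(df_page_predictions, "Primary SDGs", 'pred1',
#                  x_axis_title="Percentage (%) of aligned pages",
#                  y_axis_title="Primary SDGs")
#   fig.show()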
def save_figure_as_jpeg(fig, filename):
    """Saves the Plotly figure as a high-resolution JPEG."""
    pio.write_image(fig, filename, format='jpeg', width=700, height=650, scale=7, engine="kaleido")
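# Note: static image export goes through Plotly's kaleido engine, so the
# 'kaleido' package must be installed alongside plotly for this call to work.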
# Generate reports (page and sentence levels)
def generate_page_report(df_pages, report_file_name):
    doc = Document()
    doc.add_heading("Page-Level SDG Analysis Report", 0)
    doc.add_heading("📝 General Notes", level=2)
    doc.add_paragraph(
        'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model, trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This page-level '
        'analysis provides high-level insight into SDG alignment.'
        '\n\n'
        'Given that a page may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each page with a probability score greater than zero.'
    )
    doc.add_heading("Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the SDG that the AI model most strongly associates with each page. The bars '
        'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
        'sustainable development theme within the document.'
    )
    doc.add_heading("Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDG for each page. Although these SDGs are '
        'not the primary focus, the text has some relevance to these goals.'
    )
    for doc_name in df_pages['Document'].unique():
        # Sanitize doc_name for use in file names
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])
        doc.add_heading(f"📄 Document: {doc_name}", level=2)
        df_doc = df_pages[df_pages['Document'] == doc_name]
        # Generate and save graphs
        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_page.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_page.jpeg"
        plot_sdg(
            df_doc, "", 'pred1',
            x_axis_title="Percentage (%) of aligned pages",
            y_axis_title="Primary SDGs"
        ).write_image(first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
        plot_sdg(
            df_doc, "", 'pred2',
            x_axis_title="Percentage (%) of aligned pages",
            y_axis_title="Secondary SDGs"
        ).write_image(second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
        # Add plots to the Word document
        doc.add_picture(first_sdg_plot_path, width=Inches(6))
        doc.add_picture(second_sdg_plot_path, width=Inches(6))
    doc.save(report_file_name)
    return report_file_name
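# Illustrative usage (assuming predictions from predict_pages):
#   generate_page_report(df_page_predictions, "mydoc_SDG-Page_report.docx")
# The function writes the DOCX (and intermediate JPEG plots) to the working
# directory and returns the report file name.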
def generate_sentence_report(df_sentences, report_file_name):
    doc = Document()
    doc.add_heading("Sentence-Level SDG Analysis Report", 0)
    doc.add_heading("📝 General Notes", level=2)
    doc.add_paragraph(
        'This app splits documents into sentences using a natural language processing algorithm. '
        'Each sentence is processed by the sdgBERT AI model, trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This sentence-level '
        'analysis provides deeper insight into SDG alignment.'
        '\n\n'
        'Given that a sentence may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each sentence with a probability score greater than zero.'
    )
    doc.add_heading("Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the SDG that the AI model most strongly associates with each sentence. The bars '
        'represent the percentage of sentences most strongly aligned with each SDG. This offers deeper insight '
        'into the dominant sustainable development theme within the document.'
    )
    doc.add_heading("Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDG for each sentence. Although these SDGs are not '
        'the primary focus, the text has some relevance to these goals.'
    )
    for doc_name in df_sentences['Document'].unique():
        # Sanitize doc_name for use in file names
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])
        doc.add_heading(f"📄 Document: {doc_name}", level=2)
        df_doc = df_sentences[df_sentences['Document'] == doc_name]
        # Generate and save graphs
        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_sentence.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_sentence.jpeg"
        plot_sdg(
            df_doc, "", 'pred1',
            x_axis_title="Percentage (%) of aligned sentences",
            y_axis_title="Primary SDGs"
        ).write_image(first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
        plot_sdg(
            df_doc, "", 'pred2',
            x_axis_title="Percentage (%) of aligned sentences",
            y_axis_title="Secondary SDGs"
        ).write_image(second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
        # Add plots to the Word document
        doc.add_picture(first_sdg_plot_path, width=Inches(6))
        doc.add_picture(second_sdg_plot_path, width=Inches(6))
    doc.save(report_file_name)
    return report_file_name
# New text extraction functions with text cleaning and line joining
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): The starting page number for extraction (1-based index).
        end_page (int, optional): The ending page number for extraction (1-based index).

    Returns:
        tuple:
            - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
            - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.
    """
    try:
        # Initialize the loader; load() yields one Document per page
        # (load_and_split() would re-chunk long pages and break the page numbering)
        loader = PyPDFLoader(pdf_file_path)
        documents = loader.load()
        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # Extract document name
        # Validate and adjust the page range
        if start_page is not None and end_page is not None:
            # Convert to integers to avoid slicing issues
            start_page = int(start_page)
            end_page = int(end_page)
            # Clamp to the valid range
            if start_page < 1:
                start_page = 1
            if end_page > total_pages:
                end_page = total_pages
            if start_page > end_page:
                start_page, end_page = end_page, start_page  # Swap if out of order
            # Select the subset of pages based on user input
            selected_docs = documents[start_page - 1:end_page]
        else:
            selected_docs = documents
            start_page = 1
            end_page = total_pages
        # Initialize lists to store data
        page_data = []
        sentence_data = []
        for idx, doc in enumerate(selected_docs, start=start_page):
            page_num = idx
            text = doc.page_content.strip()
            # Join lines that belong to the same sentence
            lines = text.split('\n')
            joined_text = ' '.join(line.strip() for line in lines if line.strip())
            # Clean text
            cleaned_text = clean_text(joined_text)
            # Append page-wise data
            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": cleaned_text
            })
            # Sentence tokenization; short fragments (70 characters or fewer) are skipped
            sentences = sent_tokenize(cleaned_text)
            for sentence in sentences:
                sentence = sentence.strip()
                if sentence and len(sentence) > 70:
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })
        # Create DataFrames
        page_df = pd.DataFrame(page_data)
        sentence_df = pd.DataFrame(sentence_data)
        return page_df, sentence_df
    except Exception as e:
        raise RuntimeError(f"Error during PDF extraction: {e}")
def df_to_csv_bytes(df):
    """
    Convert a DataFrame to CSV data in bytes.

    Args:
        df (pd.DataFrame): The DataFrame to convert.

    Returns:
        bytes: CSV data in bytes.
    """
    try:
        buffer = BytesIO()
        df.to_csv(buffer, index=False)
        csv_data = buffer.getvalue()
        buffer.close()
        return csv_data
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}")
def launch_interface():
    with gr.Blocks(css=custom_css) as demo:
        # Title as a visible heading at the top of the page with an icon
        gr.Markdown(
            """
            # 🌍 SDG Document Analysis App
            Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
            """
        )
        # Shared PDF file input for both analyses
        gr.Markdown("## Upload PDF File")
        with gr.Row():
            file_input = gr.File(
                label="📁 Upload PDF File for Analysis", file_types=[".pdf"]
            )
        # Extraction mode selection with explanatory text
        gr.Markdown(
            """
            ## PDF Text Extraction Mode
            Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select
            "Range of Pages" and specify the start and end pages.
            """
        )
        with gr.Row():
            extraction_mode = gr.Radio(
                choices=["All Pages", "Range of Pages"],
                value="All Pages",
                label="Extraction Mode"
            )
        with gr.Row():
            start_page = gr.Number(value=1, label="🔢 Start Page", visible=False, info="The cover page is page 1")
            end_page = gr.Number(value=1, label="🔢 End Page", visible=False)

        # Toggle visibility of start_page and end_page based on the selected mode
        def update_page_inputs(extraction_mode):
            if extraction_mode == "Range of Pages":
                return gr.update(visible=True), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=False)

        extraction_mode.change(
            update_page_inputs,
            inputs=extraction_mode,
            outputs=[start_page, end_page]
        )
        # Main Tabs for Page-Level and Sentence-Level Analysis
        gr.Markdown("## SDG Analysis Type")
        with gr.Tab("📄 Page-Level Analysis"):
            gr.Markdown(
                """
                ### Page-Level SDG Analysis
                This section conducts Sustainable Development Goals (SDG) mapping
                of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                It provides **high-level SDG mapping** of documents at the page level.
                """
            )
            with gr.Row():
                page_button = gr.Button("🏃‍♂️ Run Page-Level Analysis")
                reset_page_button = gr.Button("🔄 Reset Page-Level Analysis", elem_classes="reset-button")
            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("📊 Primary SDGs"):
                    with gr.Row():
                        primary_page_plot = gr.Plot(label="📈 Primary SDGs Graph [Page-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Primary SDGs bar graph on the left will show " +
                            "the percentage of pages that strongly align with each SDG. The icon for the most frequent " +
                            "SDG will be highlighted on the graph. Download the Page Predictions CSV for further details.",
                            label="Note", container=True
                        )
                    gr.Markdown("##### Download Results")
                    with gr.Row():
                        page_csv = gr.File(label="📄 Download Page Predictions CSV")
                        page_docx = gr.File(label="📝 Download Page Report DOCX")
                        page_jpeg1 = gr.File(label="🖼️ Download Primary SDGs JPEG")
                with gr.TabItem("📊 Secondary SDGs"):
                    with gr.Row():
                        secondary_page_plot = gr.Plot(label="📈 Secondary SDGs Graph [Page-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Secondary SDGs bar graph on the left will show " +
                            "SDGs that are not the primary focus of the pages analysed. These SDGs are second to the " +
                            "Primary SDGs. Download the Page Predictions CSV for further details.",
                            label="Note", container=True
                        )
                    gr.Markdown("##### Download Results")
                    with gr.Row():
                        page_csv_secondary = gr.File(label="📄 Download Page Predictions CSV")
                        page_report_file_secondary = gr.File(label="📝 Download Page Report DOCX")
                        secondary_page_jpeg = gr.File(label="🖼️ Download Secondary SDGs JPEG")
        with gr.Tab("✍️ Sentence-Level Analysis"):
            gr.Markdown(
                """
                ### Sentence-Level SDG Analysis
                This section conducts Sustainable Development Goals (SDG) mapping
                using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                It provides **detailed SDG mapping** at the sentence level.
                """
            )
            with gr.Row():
                sentence_button = gr.Button("🏃‍♂️ Run Sentence-Level Analysis")
                reset_sentence_button = gr.Button("🔄 Reset Sentence-Level Analysis", elem_classes="reset-button")
            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("📊 Primary SDGs"):
                    with gr.Row():
                        primary_sentence_plot = gr.Plot(label="📈 Primary SDGs Graph [Sentence-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Primary SDGs bar graph on the left will show " +
                            "the percentage of sentences that strongly align with each SDG. The icon for the most frequent " +
                            "SDG will be highlighted on the graph. Download the Sentence Predictions CSV for further details.",
                            label="Note", container=True
                        )
                    gr.Markdown("##### Download Results")
                    with gr.Row():
                        sentence_csv = gr.File(label="📄 Download Sentence Predictions CSV")
                        sentence_docx = gr.File(label="📝 Download Sentence Report DOCX")
                        sentence_jpeg1 = gr.File(label="🖼️ Download Primary SDGs JPEG")
                with gr.TabItem("📊 Secondary SDGs"):
                    with gr.Row():
                        secondary_sentence_plot = gr.Plot(label="📈 Secondary SDGs Graph [Sentence-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Secondary SDGs bar graph on the left will show " +
                            "SDGs that are not the primary focus of the sentences analysed. These SDGs are second to the " +
                            "Primary SDGs. Download the Sentence Predictions CSV for further details.",
                            label="Note", container=True
                        )
                    gr.Markdown("##### Download Results")
                    with gr.Row():
                        sentence_csv_secondary = gr.File(label="📄 Download Sentence Predictions CSV")
                        sentence_report_file_secondary = gr.File(label="📝 Download Sentence Report DOCX")
                        secondary_sentence_jpeg = gr.File(label="🖼️ Download Secondary SDGs JPEG")
        # Function to process page-level analysis
        @spaces.GPU
        def process_pages(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]
            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                    original_file_name = 'uploaded_document'
                # Sanitize the file name for use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)
                # Determine page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)
                # Extract text and create DataFrames
                page_df, _ = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )
                # Predict SDGs at page level
                df_page_predictions = predict_pages(page_df)
                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_page_predictions, "",
                    'pred1',
                    x_axis_title="Percentage (%) of aligned pages",
                    y_axis_title="Primary SDGs"
                )
                second_plot = plot_sdg(
                    df_page_predictions, "",
                    'pred2',
                    x_axis_title="Percentage (%) of aligned pages",
                    y_axis_title="Secondary SDGs"
                )
                # Define output file names (the primary and secondary tabs share the same CSV and report)
                page_csv_file = f"{sanitized_file_name}_SDG-Page_predictions.csv"
                page_report_file = f"{sanitized_file_name}_SDG-Page_report.docx"
                primary_page_jpeg = f"{sanitized_file_name}_SDG-Page_primary_graph.jpeg"
                page_csv_file_secondary = f"{sanitized_file_name}_SDG-Page_predictions.csv"
                page_report_file_secondary = f"{sanitized_file_name}_SDG-Page_report.docx"
                secondary_page_jpeg = f"{sanitized_file_name}_SDG-Page_secondary_graph.jpeg"
                # Save CSV and reports
                df_page_predictions.to_csv(page_csv_file, index=False)
                generate_page_report(df_page_predictions, page_report_file)
                df_page_predictions.to_csv(page_csv_file_secondary, index=False)
                generate_page_report(df_page_predictions, page_report_file_secondary)
                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_page_jpeg)
                save_figure_as_jpeg(second_plot, secondary_page_jpeg)
                return (
                    first_plot, second_plot,
                    page_csv_file, page_report_file, primary_page_jpeg,
                    page_csv_file_secondary, page_report_file_secondary, secondary_page_jpeg
                )
            except Exception as e:
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]
        # Function to process sentence-level analysis
        @spaces.GPU
        def process_sentences(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]
            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                    original_file_name = 'uploaded_document'
                # Sanitize the file name for use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)
                # Determine page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)
                # Extract text and create DataFrames
                _, sentence_df = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )
                # Predict SDGs at sentence level
                df_sentence_predictions = predict_sentences(sentence_df)
                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_sentence_predictions, "",
                    'pred1',
                    x_axis_title="Percentage (%) of aligned sentences",
                    y_axis_title="Primary SDGs"
                )
                second_plot = plot_sdg(
                    df_sentence_predictions, "",
                    'pred2',
                    x_axis_title="Percentage (%) of aligned sentences",
                    y_axis_title="Secondary SDGs"
                )
                # Define output file names (the primary and secondary tabs share the same CSV and report)
                sentence_csv_file = f"{sanitized_file_name}_SDG-Sentence_predictions.csv"
                sentence_report_file = f"{sanitized_file_name}_SDG-Sentence_report.docx"
                primary_sentence_jpeg = f"{sanitized_file_name}_SDG-Sentence_primary_graph.jpeg"
                sentence_csv_file_secondary = f"{sanitized_file_name}_SDG-Sentence_predictions.csv"
                sentence_report_file_secondary = f"{sanitized_file_name}_SDG-Sentence_report.docx"
                secondary_sentence_jpeg = f"{sanitized_file_name}_SDG-Sentence_secondary_graph.jpeg"
                # Save CSV and reports
                df_sentence_predictions.to_csv(sentence_csv_file, index=False)
                generate_sentence_report(df_sentence_predictions, sentence_report_file)
                df_sentence_predictions.to_csv(sentence_csv_file_secondary, index=False)
                generate_sentence_report(df_sentence_predictions, sentence_report_file_secondary)
                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_sentence_jpeg)
                save_figure_as_jpeg(second_plot, secondary_sentence_jpeg)
                return (
                    first_plot, second_plot,
                    sentence_csv_file, sentence_report_file, primary_sentence_jpeg,
                    sentence_csv_file_secondary, sentence_report_file_secondary, secondary_sentence_jpeg
                )
            except Exception as e:
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]
        # Reset functions to clear the outputs
        def reset_page_outputs():
            return [None, None, None, None, None, None, None, None]

        def reset_sentence_outputs():
            return [None, None, None, None, None, None, None, None]
        # Button actions for Page-Level Analysis
        page_button.click(
            process_pages,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_page_plot,           # 📈 Primary SDGs [Page-Level]
                secondary_page_plot,         # 📈 Secondary SDGs [Page-Level]
                page_csv,                    # 📄 Download Page Predictions CSV
                page_docx,                   # 📝 Download Page Report DOCX
                page_jpeg1,                  # 🖼️ Download Primary SDGs JPEG
                page_csv_secondary,          # 📄 Download Page Predictions CSV
                page_report_file_secondary,  # 📝 Download Page Report DOCX
                secondary_page_jpeg          # 🖼️ Download Secondary SDGs JPEG
            ]
        )
        reset_page_button.click(
            reset_page_outputs,
            outputs=[
                primary_page_plot,
                secondary_page_plot,
                page_csv,
                page_docx,
                page_jpeg1,
                page_csv_secondary,
                page_report_file_secondary,
                secondary_page_jpeg
            ]
        )
        # Button actions for Sentence-Level Analysis
        sentence_button.click(
            process_sentences,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_sentence_plot,           # 📈 Primary SDGs [Sentence-Level]
                secondary_sentence_plot,         # 📈 Secondary SDGs [Sentence-Level]
                sentence_csv,                    # 📄 Download Sentence Predictions CSV
                sentence_docx,                   # 📝 Download Sentence Report DOCX
                sentence_jpeg1,                  # 🖼️ Download Primary SDGs JPEG
                sentence_csv_secondary,          # 📄 Download Sentence Predictions CSV
                sentence_report_file_secondary,  # 📝 Download Sentence Report DOCX
                secondary_sentence_jpeg          # 🖼️ Download Secondary SDGs JPEG
            ]
        )
        reset_sentence_button.click(
            reset_sentence_outputs,
            outputs=[
                primary_sentence_plot,
                secondary_sentence_plot,
                sentence_csv,
                sentence_docx,
                sentence_jpeg1,
                sentence_csv_secondary,
                sentence_report_file_secondary,
                secondary_sentence_jpeg
            ]
        )
    demo.queue().launch()


if __name__ == "__main__":
    launch_interface()