sadickam committed on
Commit
5bc0ffd
·
verified ·
1 Parent(s): 79a56c6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +642 -0
app.py ADDED
@@ -0,0 +1,642 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import re
4
+ import torch
5
+ import pandas as pd
6
+ import plotly.express as px
7
+ import plotly.io as pio
8
+ import nltk
9
+ import tempfile
10
+ from io import BytesIO
11
+ import base64
12
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
13
+ from nltk.tokenize import sent_tokenize
14
+ from docx.shared import Inches
15
+ from docx import Document
16
+ import numpy as np
17
+ # Needed for HF GPU access
18
+ import spaces
19
+
20
# Sentence-tokenizer data required by `sent_tokenize`.
# Older NLTK releases load the pickled "punkt" models, while newer releases
# (3.8.2 and later) look up the "punkt_tab" resource instead, so fetch both to
# keep tokenization working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
21
+
22
+ # Import PyPDFLoader for PDF processing
23
+ from langchain_community.document_loaders import PyPDFLoader
24
+
25
# Hugging Face Hub repository id that both the sequence-classification model
# and its tokenizer are loaded from via `from_pretrained`.
checkpoint = "sadickam/sdgBERT"
27
+
28
+ # Preprocessing function for text
29
def prep_text(text):
    """Normalise raw text before it is handed to the tokenizer.

    The input is coerced to ``str`` and split into sentences with NLTK. Each
    sentence is lower-cased and its whitespace collapsed to single spaces,
    the sentences are re-joined with single spaces, and finally every
    backtick and double-quote character is removed.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        str: The cleaned, lower-cased text.
    """
    normalised_sentences = [
        ' '.join(str(piece).strip().lower() for piece in sentence.split())
        for sentence in sent_tokenize(str(text))
    ]
    flattened = ' '.join(normalised_sentences).strip()
    return re.sub(r'`|"', "", flattened)
37
+
38
+ # Load the tokenizer and model with GPU support
39
# Compute device shared by model loading and inference: CUDA when PyTorch
# reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def load_model_and_tokenizer():
    """Load the classifier and its tokenizer from ``checkpoint``.

    Returns:
        tuple: ``(model, tokenizer)``; the model has been moved to ``device``.
    """
    sdg_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    sdg_model = sdg_model.to(device)
    sdg_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return sdg_model, sdg_tokenizer
46
+
47
+ # SDG labels
48
# SDG class names. Each label is paired by position with the model's output
# probability at the same index when predictions are ranked.
label_list = [
    'SDG1_No Poverty',
    'SDG2_Zero Hunger',
    'SDG3_Good Health and Well-being',
    'SDG4_Quality Education',
    'SDG5_Gender Equality',
    'SDG6_Clean Water and Sanitation',
    'SDG7_Affordable and Clean Energy',
    'SDG8_Decent Work and Economic Growth',
    'SDG9_Industry, Innovation and Infrastructure',
    'SDG10_Reduced Inequality',
    'SDG11_Sustainable Cities and Communities',
    'SDG12_Responsible Consumption and Production',
    'SDG13_Climate Action',
    'SDG14_Life Below Water',
    'SDG15_Life on Land',
    'SDG16_Peace, Justice and Strong Institutions',
]
56
+
57
+ # Function to predict SDGs for a batch of text inputs
58
def predict_sdg_labels_batch(texts, model, tokenizer):
    """Return class probabilities for a batch of texts.

    Args:
        texts: List of strings to classify.
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer used to encode ``texts`` for ``model``.

    Returns:
        list[list[float]]: One list of softmax probabilities per input text.
    """
    # Encode the whole batch at once; inputs are truncated to 512 tokens and
    # padded so the batch forms a single tensor.
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    model.eval()  # switch the model to inference mode
    with torch.no_grad():  # no gradients are needed for prediction
        logits = model(**encoded).logits
        probabilities = torch.softmax(logits, dim=1)
    return probabilities.tolist()
65
+
66
+ # Page-level predictions with batch processing
67
def _rank_sdg_predictions(frame, text_column, batch_size, score_digits=None):
    """Classify ``frame[text_column]`` in batches and append ranked SDG columns.

    For every row the model probabilities are sorted from most to least likely
    and written to ``pred1``/``score1`` ... ``predN``/``scoreN`` columns, where
    ``N`` is ``len(label_list)``.

    Args:
        frame (pd.DataFrame): Input rows; left unmodified (a copy is returned).
        text_column (str): Name of the column holding the text to classify.
        batch_size (int): Number of rows sent to the model per forward pass.
        score_digits (int, optional): When given, scores are rounded to this
            many decimal places; when ``None`` they are stored unrounded.

    Returns:
        pd.DataFrame: Copy of ``frame`` with the original columns first,
        followed by the interleaved ``pred``/``score`` columns.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    model, tokenizer = load_model_and_tokenizer()
    num_labels = len(label_list)
    # ranked_labels[k] / ranked_scores[k] collect the (k+1)-th best prediction of every row.
    ranked_labels = [[] for _ in range(num_labels)]
    ranked_scores = [[] for _ in range(num_labels)]

    for start in range(0, len(frame), batch_size):
        batch = frame.iloc[start:start + batch_size]  # iloc clamps the final, shorter batch
        texts = batch[text_column].apply(prep_text).tolist()
        for probabilities in predict_sdg_labels_batch(texts, model, tokenizer):
            ranking = sorted(
                zip(label_list, probabilities),
                key=lambda pair: pair[1],
                reverse=True,
            )
            for rank, (label, score) in enumerate(ranking):
                ranked_labels[rank].append(label)
                ranked_scores[rank].append(
                    score if score_digits is None else round(score, score_digits)
                )

    results = frame.copy()
    prediction_columns = []
    for rank in range(num_labels):
        pred_name, score_name = f'pred{rank + 1}', f'score{rank + 1}'
        results[pred_name] = ranked_labels[rank]
        results[score_name] = ranked_scores[rank]
        prediction_columns += [pred_name, score_name]

    # Keep the caller's columns first, then pred1, score1, pred2, score2, ...
    leading_columns = [col for col in results.columns if col not in prediction_columns]
    return results[leading_columns + prediction_columns]


def predict_pages(page_df, batch_size=32):
    """Predict ranked SDGs for every page.

    Args:
        page_df (pd.DataFrame): Page rows with the page text in a ``Text`` column.
        batch_size (int): Number of pages classified per model call.

    Returns:
        pd.DataFrame: Copy of ``page_df`` extended with ``pred1``/``score1`` ...
        columns; scores are stored unrounded.
    """
    return _rank_sdg_predictions(page_df, 'Text', batch_size)


def predict_sentences(sentence_df, batch_size=32):
    """Predict ranked SDGs for every sentence.

    Args:
        sentence_df (pd.DataFrame): Sentence rows with the text in a ``Sentence`` column.
        batch_size (int): Number of sentences classified per model call.

    Returns:
        pd.DataFrame: Copy of ``sentence_df`` extended with ``pred1``/``score1`` ...
        columns; scores are rounded to three decimal places.
    """
    return _rank_sdg_predictions(sentence_df, 'Sentence', batch_size, score_digits=3)
134
+
135
+ # Define unique colors for each SDG
136
# Bar colour for each SDG label; handed to Plotly as the discrete colour map
# so that a given SDG is drawn in the same colour in every chart.
sdg_colors = {
    'SDG1_No Poverty': '#E5243B',
    'SDG2_Zero Hunger': '#DDA63A',
    'SDG3_Good Health and Well-being': '#4C9F38',
    'SDG4_Quality Education': '#C5192D',
    'SDG5_Gender Equality': '#FF3A21',
    'SDG6_Clean Water and Sanitation': '#26BDE2',
    'SDG7_Affordable and Clean Energy': '#FCC30B',
    'SDG8_Decent Work and Economic Growth': '#A21942',
    'SDG9_Industry, Innovation and Infrastructure': '#FD6925',
    'SDG10_Reduced Inequality': '#DD1367',
    'SDG11_Sustainable Cities and Communities': '#FD9D24',
    'SDG12_Responsible Consumption and Production': '#BF8B2E',
    'SDG13_Climate Action': '#3F7E44',
    'SDG14_Life Below Water': '#0A97D9',
    'SDG15_Life on Land': '#56C02B',
    'SDG16_Peace, Justice and Strong Institutions': '#00689D',
}
154
+
155
+ # Function to plot SDG dominant bar graphs using Plotly
156
def plot_sdg(df, title, pred_column):
    """Build a horizontal Plotly bar chart of SDG label shares.

    Rows whose ``pred_column`` is missing are ignored; the remaining labels
    are counted and expressed as a percentage of all counted rows.

    Args:
        df: DataFrame containing SDG predictions.
        title: Title of the plot.
        pred_column: Name of the column whose labels are counted.

    Returns:
        The configured Plotly figure.
    """
    rows_with_label = df[df[pred_column].notna()]
    label_counts = rows_with_label[pred_column].value_counts().sort_values(ascending=False)
    label_share = (label_counts / label_counts.sum()) * 100
    chart_data = label_share.rename_axis('SDG Label').reset_index(name='Percentage')

    # One bar per SDG, coloured from the shared SDG colour map.
    fig = px.bar(
        chart_data,
        y='SDG Label',
        x='Percentage',
        orientation='h',
        title=title,
        color='SDG Label',
        color_discrete_map=sdg_colors,
    )

    # Keep the SDG names visible on the y-axis.
    fig.update_yaxes(showticklabels=True)

    # Print the percentage value on each bar.
    fig.update_traces(
        texttemplate='%{x:.2f}%',
        textposition='auto',
        textfont={'size': 10},
    )

    # Fonts, margins and overall figure size.
    fig.update_layout(
        title={'text': title, 'font': {'size': 14}},
        yaxis={'automargin': True, 'title': None, 'tickfont': {'size': 12}},
        margin={'l': 20, 'r': 5, 't': 30, 'b': 20},
        height=600,
        width=700,
        showlegend=False,
        template="simple_white",
        xaxis={'tickfont': {'size': 12}},
    )

    return fig
210
+
211
def save_figure_as_jpeg(fig, filename):
    """Write a Plotly figure to ``filename`` as a JPEG.

    The image is exported at 1000x600 with a scale factor of 5.

    Args:
        fig: Plotly figure to export.
        filename: Destination path of the JPEG file.
    """
    export_options = {'format': 'jpeg', 'width': 1000, 'height': 600, 'scale': 5}
    pio.write_image(fig, filename, **export_options)
214
+
215
+ # Generate reports (page and sentence levels)
216
def generate_page_report(df_pages):
    """Create the page-level DOCX report and return its path.

    The report contains fixed explanatory notes followed, for every distinct
    value in the ``Document`` column, by a bar chart of the primary (``pred1``)
    and secondary (``pred2``) SDG predictions.

    Args:
        df_pages (pd.DataFrame): Page-level predictions with at least the
            ``Document``, ``pred1`` and ``pred2`` columns.

    Returns:
        str: Path of the saved report, ``"page_report.docx"``.
    """
    doc = Document()
    doc.add_heading("Page-Level SDG Analysis Report", 0)

    doc.add_heading("General Notes", level=2)
    doc.add_paragraph(
        'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This page-level '
        'analysis provides high-level insight into SDG alignment.'
        '\n\n'
        'Given that a page may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each page with a probability score greater than zero.'
    )

    doc.add_heading("Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the most essential SDG the AI model associates with pages. The bars '
        'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
        'sustainable development theme within the document.'
    )

    doc.add_heading("Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDGs for pages. Although these SDGs are '
        'not the primary focus, the text has some relevance to these goals.'
    )

    charts = (
        ("first", "Primary SDGs", 'pred1'),
        ("second", "Secondary SDGs", 'pred2'),
    )

    # The chart images are only an intermediate step, so render them into a
    # throw-away directory instead of leaving "<document>_..._page.jpeg" files
    # behind in the working directory after every run.
    with tempfile.TemporaryDirectory() as plot_dir:
        for doc_index, doc_name in enumerate(df_pages['Document'].unique()):
            doc.add_heading(f"Document: {doc_name}", level=2)
            df_doc = df_pages[df_pages['Document'] == doc_name]

            for rank_name, chart_title, pred_column in charts:
                # Index-based file names avoid any dependence on characters in doc_name.
                plot_path = os.path.join(plot_dir, f"{doc_index}_{rank_name}_sdg_page.jpeg")
                plot_sdg(df_doc, chart_title, pred_column).write_image(
                    plot_path, format='jpeg', scale=7, engine="kaleido")
                doc.add_picture(plot_path, width=Inches(6))

        # Save while the image files still exist on disk.
        doc.save("page_report.docx")

    return "page_report.docx"
263
+
264
def generate_sentence_report(df_sentences):
    """Create the sentence-level DOCX report and return its path.

    The report contains fixed explanatory notes followed, for every distinct
    value in the ``Document`` column, by a bar chart of the primary (``pred1``)
    and secondary (``pred2``) SDG predictions.

    Args:
        df_sentences (pd.DataFrame): Sentence-level predictions with at least
            the ``Document``, ``pred1`` and ``pred2`` columns.

    Returns:
        str: Path of the saved report, ``"sentence_report.docx"``.
    """
    doc = Document()
    doc.add_heading("Sentence-Level SDG Analysis Report", 0)

    doc.add_heading("General Notes", level=2)
    doc.add_paragraph(
        'This app splits documents into sentences using a natural language processing algorithm. '
        'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This sentence-level '
        'analysis provides deeper insight into SDG alignment.'
        '\n\n'
        'Given that a sentence may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each sentence with a probability score greater than zero.'
    )

    doc.add_heading("Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the most essential SDG the AI model associates with sentences. The bars '
        'represent the percentage of sentences most strongly aligned with each SDG. This offers more profound insight '
        'into the dominant sustainable development theme within the document.'
    )

    doc.add_heading("Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDGs for sentences. Although these SDGs are not '
        'the primary focus, the text has some relevance to these goals.'
    )

    charts = (
        ("first", "Primary SDGs", 'pred1'),
        ("second", "Secondary SDGs", 'pred2'),
    )

    # The chart images are only an intermediate step, so render them into a
    # throw-away directory instead of leaving "<document>_..._sentence.jpeg"
    # files behind in the working directory after every run.
    with tempfile.TemporaryDirectory() as plot_dir:
        for doc_index, doc_name in enumerate(df_sentences['Document'].unique()):
            doc.add_heading(f"Document: {doc_name}", level=2)
            df_doc = df_sentences[df_sentences['Document'] == doc_name]

            for rank_name, chart_title, pred_column in charts:
                # Index-based file names avoid any dependence on characters in doc_name.
                plot_path = os.path.join(plot_dir, f"{doc_index}_{rank_name}_sdg_sentence.jpeg")
                plot_sdg(df_doc, chart_title, pred_column).write_image(
                    plot_path, format='jpeg', scale=7, engine="kaleido")
                doc.add_picture(plot_path, width=Inches(6))

        # Save while the image files still exist on disk.
        doc.save("sentence_report.docx")

    return "sentence_report.docx"
312
+
313
+ # New text extraction functions
314
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): The starting page number for extraction (1-based index).
            Defaults to the first page when omitted.
        end_page (int, optional): The ending page number for extraction (1-based index, inclusive).
            Defaults to the last page when omitted.

    Returns:
        tuple:
            - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
            - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.

    Raises:
        RuntimeError: If loading or processing the PDF fails; the original
            exception is attached as the cause.
    """
    try:
        loader = PyPDFLoader(pdf_file_path)
        # `load()` is used instead of `load_and_split()`: the latter also runs a
        # text splitter over the pages, so a long page can become several chunks
        # and the list position would no longer match the page number.
        documents = loader.load()

        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)

        if start_page is None and end_page is None:
            first_page, last_page = 1, total_pages
        else:
            # A missing bound falls back to the corresponding end of the document.
            first_page = 1 if start_page is None else int(start_page)
            last_page = total_pages if end_page is None else int(end_page)

            # Put the bounds in order first, then clamp both into 1..total_pages,
            # so out-of-range or reversed input still selects a valid slice.
            if first_page > last_page:
                first_page, last_page = last_page, first_page
            first_page = max(1, min(first_page, total_pages))
            last_page = max(1, min(last_page, total_pages))

        selected_docs = documents[first_page - 1:last_page]

        page_data = []
        sentence_data = []

        for page_num, doc in enumerate(selected_docs, start=first_page):
            text = doc.page_content.strip()

            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": text
            })

            # Split the page into sentences, skipping empty fragments.
            for sentence in sent_tokenize(text):
                sentence = sentence.strip()
                if sentence:
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        # Explicit columns keep the schema stable even when nothing was extracted.
        page_df = pd.DataFrame(page_data, columns=["Document", "Page", "Text"])
        sentence_df = pd.DataFrame(sentence_data, columns=["Document", "Page", "Sentence"])

        return page_df, sentence_df

    except Exception as e:
        raise RuntimeError(f"Error during PDF extraction: {e}") from e
389
+
390
def df_to_csv_bytes(df):
    """
    Convert DataFrame to CSV in bytes.

    Args:
        df (pd.DataFrame): The DataFrame to convert.

    Returns:
        bytes: CSV data in bytes (header row included, index omitted).

    Raises:
        RuntimeError: If the conversion fails; the original exception is
            attached as the cause.
    """
    try:
        # The context manager closes the buffer even if `to_csv` raises.
        with BytesIO() as buffer:
            df.to_csv(buffer, index=False)
            return buffer.getvalue()
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}") from e
406
+
407
def _resolve_pdf_path(file):
    """Return ``(path, is_temporary)`` for the value delivered by the upload widget.

    Objects exposing ``.name`` are used as-is, plain path strings / path-like
    objects are accepted directly, and anything else is treated as a readable
    stream that is copied to a temporary ``.pdf`` file (``is_temporary`` is
    then ``True`` and the caller is responsible for deleting it).
    """
    if hasattr(file, 'name'):
        return file.name, False
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file), False
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
        temp_pdf.write(file.read())
    return temp_pdf.name, True


def _selected_page_range(extraction_mode, start_page, end_page):
    """Translate the UI inputs into ``(start, end)`` page numbers or ``(None, None)``."""
    if extraction_mode == "All Pages":
        return None, None
    # A number field may arrive as None (e.g. when cleared); pass that through
    # instead of letting int(None) raise.
    start = None if start_page is None else int(start_page)
    end = None if end_page is None else int(end_page)
    return start, end


def _remove_temporary_pdf(path, is_temporary):
    """Delete the temporary upload copy, ignoring a file that is already gone."""
    if is_temporary and path:
        try:
            os.remove(path)
        except OSError:
            pass


def launch_interface():
    """Build the Gradio UI for page- and sentence-level SDG analysis and launch it.

    The interface has one shared PDF upload with an optional page range and
    two tabs. Each tab shows the primary/secondary SDG bar charts and offers
    the predictions CSV, the DOCX report and both charts as JPEG downloads.
    When an analysis fails, the error is printed and all six outputs of the
    tab are cleared.
    """
    import traceback  # only needed for error reporting inside the handlers

    with gr.Blocks(title="SDG Document Analysis App") as demo:

        # Title as a visible heading at the top of the page
        gr.Markdown(
            """
            # SDG Document Analysis App
            Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
            """
        )

        # Shared PDF file input for both analyses
        with gr.Row():
            file_input = gr.File(
                label="Upload PDF File for Analysis", file_types=[".pdf"]
            )

        # Extraction mode selection
        extraction_mode = gr.Radio(
            choices=["All Pages", "Range of Pages"],
            value="All Pages",
            label="Extraction Mode"
        )
        start_page = gr.Number(value=1, label="Start Page", visible=False)
        end_page = gr.Number(value=1, label="End Page", visible=False)

        # Show the page-number fields only while a page range is requested
        def update_page_inputs(extraction_mode):
            show_range = extraction_mode == "Range of Pages"
            return gr.update(visible=show_range), gr.update(visible=show_range)

        extraction_mode.change(
            update_page_inputs,
            inputs=extraction_mode,
            outputs=[start_page, end_page]
        )

        # Tabs for page-level and sentence-level analysis
        with gr.Tab("Page-Level Analysis"):
            gr.Markdown(
                """
                ## Page-Level SDG Analysis
                This section conducts Sustainable Development Goals (SDG) mapping
                of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                It provides **high-level SDG mapping** of documents at the page level.
                """
            )
            with gr.Row():
                with gr.Column():
                    primary_page_plot = gr.Plot(label="Primary SDGs [Page-Level]")
                with gr.Column():
                    secondary_page_plot = gr.Plot(label="Secondary SDGs [Page-Level]")

            with gr.Row():
                page_csv = gr.File(label="Download Page Predictions CSV")
                page_docx = gr.File(label="Download Page Report DOCX")
                page_jpeg1 = gr.File(label="Download Primary SDGs JPEG")
                page_jpeg2 = gr.File(label="Download Secondary SDGs JPEG")

            page_button = gr.Button("Run Page-Level Analysis")
            reset_page_button = gr.Button("Reset Page-Level Analysis")

        with gr.Tab("Sentence-Level Analysis"):
            gr.Markdown(
                """
                ## Sentence-Level SDG Analysis
                This section conducts Sustainable Development Goals (SDG) mapping
                using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                It provides **detailed SDG mapping** at the sentence level.
                """
            )
            with gr.Row():
                with gr.Column():
                    primary_sentence_plot = gr.Plot(label="Primary SDGs [Sentence-Level]")
                with gr.Column():
                    secondary_sentence_plot = gr.Plot(label="Secondary SDGs [Sentence-Level]")

            with gr.Row():
                sentence_csv = gr.File(label="Download Sentence Predictions CSV")
                sentence_docx = gr.File(label="Download Sentence Report DOCX")
                sentence_jpeg1 = gr.File(label="Download Primary SDGs JPEG")
                sentence_jpeg2 = gr.File(label="Download Secondary SDGs JPEG")

            sentence_button = gr.Button("Run Sentence-Level Analysis")
            reset_sentence_button = gr.Button("Reset Sentence-Level Analysis")

        # Function to process page-level analysis
        @spaces.GPU
        def process_pages(file, extraction_mode, start_page, end_page):
            if not file:
                return None, None, None, None, None, None

            pdf_file_path, is_temporary = None, False
            try:
                pdf_file_path, is_temporary = _resolve_pdf_path(file)
                selected_start, selected_end = _selected_page_range(
                    extraction_mode, start_page, end_page)

                # Extract text and create DataFrames
                page_df, _ = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at page level
                df_page_predictions = predict_pages(page_df)

                first_plot = plot_sdg(df_page_predictions, "", 'pred1')
                second_plot = plot_sdg(df_page_predictions, "", 'pred2')

                df_page_predictions.to_csv('page_predictions.csv', index=False)
                page_report = generate_page_report(df_page_predictions)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, "primary_page.jpeg")
                save_figure_as_jpeg(second_plot, "secondary_page.jpeg")

                return (
                    first_plot, second_plot, 'page_predictions.csv', page_report,
                    'primary_page.jpeg', 'secondary_page.jpeg')

            except Exception as e:
                # Best effort: log the failure (with traceback) and clear the outputs.
                print(f"Error: {e}")
                traceback.print_exc()
                return None, None, None, None, None, None

            finally:
                _remove_temporary_pdf(pdf_file_path, is_temporary)

        # Function to process sentence-level analysis
        @spaces.GPU
        def process_sentences(file, extraction_mode, start_page, end_page):
            if not file:
                return None, None, None, None, None, None

            pdf_file_path, is_temporary = None, False
            try:
                pdf_file_path, is_temporary = _resolve_pdf_path(file)
                selected_start, selected_end = _selected_page_range(
                    extraction_mode, start_page, end_page)

                # Extract text and create DataFrames
                _, sentence_df = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at sentence level
                df_sentence_predictions = predict_sentences(sentence_df)

                first_plot = plot_sdg(df_sentence_predictions, "", 'pred1')
                second_plot = plot_sdg(df_sentence_predictions, "", 'pred2')

                df_sentence_predictions.to_csv('sentence_predictions.csv', index=False)
                sentence_report = generate_sentence_report(df_sentence_predictions)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, "primary_sentence.jpeg")
                save_figure_as_jpeg(second_plot, "secondary_sentence.jpeg")

                return (
                    first_plot, second_plot, 'sentence_predictions.csv', sentence_report,
                    'primary_sentence.jpeg', 'secondary_sentence.jpeg')

            except Exception as e:
                # Best effort: log the failure (with traceback) and clear the outputs.
                print(f"Error: {e}")
                traceback.print_exc()
                return None, None, None, None, None, None

            finally:
                _remove_temporary_pdf(pdf_file_path, is_temporary)

        # Reset functions to clear the outputs
        def reset_page_outputs():
            return None, None, None, None, None, None

        def reset_sentence_outputs():
            return None, None, None, None, None, None

        # Button actions for each tab
        page_button.click(
            process_pages,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[primary_page_plot, secondary_page_plot, page_csv, page_docx,
                     page_jpeg1, page_jpeg2]
        )

        sentence_button.click(
            process_sentences,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[primary_sentence_plot, secondary_sentence_plot, sentence_csv, sentence_docx,
                     sentence_jpeg1, sentence_jpeg2]
        )

        # Reset button actions to clear outputs
        reset_page_button.click(
            reset_page_outputs,
            outputs=[primary_page_plot, secondary_page_plot, page_csv, page_docx,
                     page_jpeg1, page_jpeg2]
        )

        reset_sentence_button.click(
            reset_sentence_outputs,
            outputs=[primary_sentence_plot, secondary_sentence_plot, sentence_csv, sentence_docx,
                     sentence_jpeg1, sentence_jpeg2]
        )

    demo.queue().launch()


launch_interface()