import gradio as gr
import os
import re
import torch
import pandas as pd
import plotly.express as px
import plotly.io as pio
import nltk
import tempfile
from io import BytesIO
import base64
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.tokenize import sent_tokenize
from docx.shared import Inches
from docx import Document
import numpy as np
# Needed for HF GPU access
import spaces

from styles import custom_css  # Importing custom CSS

nltk.download('punkt')
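# Newer NLTK releases ship the sentence tokenizer data as 'punkt_tab';
# downloading both keeps sent_tokenize working across NLTK versions.
nltk.download('punkt_tab')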

# Import PyPDFLoader for PDF processing
from langchain_community.document_loaders import PyPDFLoader

# Model checkpoint for SDG BERT
checkpoint = "sadickam/sdgBERT"

# Text cleaning function
def clean_text(text):
    """
    Cleans the extracted text by removing irrelevant characters but retains currency symbols.
    """
    text = text.strip()
    # Define the allowed characters (including currency symbols)
    allowed_chars = r'[^a-zA-Z0-9\s\.,!?$€£¥₹¢₩]'
    text = re.sub(allowed_chars, '', text)
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text
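
# Illustrative example: clean_text("Cost:  $5 (approx.)") -> "Cost: $5 approx."
# (parentheses fall outside the allow-list and are removed; whitespace is collapsed)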

# Preprocessing function for text
def prep_text(text):
    clean_sents = []
    sent_tokens = sent_tokenize(str(text))
    for sent_token in sent_tokens:
        word_tokens = [str(word_token).strip().lower() for word_token in sent_token.split()]
        clean_sents.append(' '.join(word_tokens))
    joined = ' '.join(clean_sents).strip()
    return re.sub(r'`|"', "", joined)
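
# Illustrative example: prep_text('He said "Hello World."') -> 'he said hello world.'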

# Load the tokenizer and model with GPU support
def load_model_and_tokenizer():
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer

# Define device (ensure usage of GPU if available in Hugging Face Spaces)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SDG labels
label_list = [
    'SDG1_No Poverty', 'SDG2_Zero Hunger', 'SDG3_Good Health and Well-being', 'SDG4_Quality Education',
    'SDG5_Gender Equality', 'SDG6_Clean Water and Sanitation', 'SDG7_Affordable and Clean Energy',
    'SDG8_Decent Work and Economic Growth', 'SDG9_Industry, Innovation and Infrastructure',
    'SDG10_Reduced Inequality', 'SDG11_Sustainable Cities and Communities',
    'SDG12_Responsible Consumption and Production', 'SDG13_Climate Action',
    'SDG14_Life Below Water', 'SDG15_Life on Land', 'SDG16_Peace, Justice and Strong Institutions'
]

# Function to predict SDGs for a batch of text inputs
def predict_sdg_labels_batch(texts, model, tokenizer):
    tokenized_texts = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    model.eval()
    with torch.no_grad():
        text_logits = model(**tokenized_texts).logits
    predictions = torch.softmax(text_logits, dim=1).tolist()
    return predictions
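
# Each entry in the returned list corresponds to one input text and holds 16
# softmax probabilities ordered as in label_list (summing to ~1.0 per text).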

# Page-level predictions with batch processing
def predict_pages(page_df, batch_size=32):
    model, tokenizer = load_model_and_tokenizer()
    df_results = page_df.copy()
    num_rows = len(page_df)
    all_predicted_labels = [[] for _ in range(16)]
    all_prediction_scores = [[] for _ in range(16)]

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        df_chunk = page_df.iloc[start:end]
        # Clean text
        texts = df_chunk['Text'].apply(clean_text).apply(prep_text).tolist()
        predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
        for predictions in predictions_batch:
            sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
            for i, (label, score) in enumerate(sorted_preds):
                all_predicted_labels[i].append(label)
                all_prediction_scores[i].append(score)

    # Add columns to the DataFrame in the desired order (pred1, score1, pred2, score2, ...)
    for i in range(16):
        df_results[f'pred{i + 1}'] = all_predicted_labels[i]
        df_results[f'score{i + 1}'] = all_prediction_scores[i]

    # Reorder columns to ensure preds and scores are interleaved in the correct order
    reordered_columns = []
    for i in range(16):
        reordered_columns.append(f'pred{i + 1}')
        reordered_columns.append(f'score{i + 1}')
    other_columns = [col for col in df_results.columns if col not in reordered_columns]
    df_results = df_results[other_columns + reordered_columns]
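    # Final column layout: original columns (Document, Page, Text) followed by
    # pred1, score1, ..., pred16, score16 in descending order of model confidence.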

    return df_results

# Sentence-level predictions with batch processing
def predict_sentences(sentence_df, batch_size=32):
    model, tokenizer = load_model_and_tokenizer()
    df_combined_sentences = sentence_df.copy()

    num_rows = len(sentence_df)
    all_predicted_labels = [[] for _ in range(16)]
    all_prediction_scores = [[] for _ in range(16)]

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        df_chunk = sentence_df.iloc[start:end]
        # Clean text
        texts = df_chunk['Sentence'].apply(clean_text).apply(prep_text).tolist()
        predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
        for predictions in predictions_batch:
            sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
            for i, (label, score) in enumerate(sorted_preds):
                all_predicted_labels[i].append(label)
                all_prediction_scores[i].append(round(score, 3))

    # Add predictions and scores to DataFrame
    for i in range(16):
        df_combined_sentences[f'pred{i + 1}'] = all_predicted_labels[i]
        df_combined_sentences[f'score{i + 1}'] = all_prediction_scores[i]

    # Reorder columns
    reordered_columns = []
    for i in range(16):
        reordered_columns.append(f'pred{i + 1}')
        reordered_columns.append(f'score{i + 1}')
    other_columns = [col for col in df_combined_sentences.columns if col not in reordered_columns]
    df_combined_sentences = df_combined_sentences[other_columns + reordered_columns]

    return df_combined_sentences

# Define unique colors for each SDG (hex codes follow the official UN SDG brand palette)
sdg_colors = {
    "SDG1_No Poverty": "#E5243B",
    "SDG2_Zero Hunger": "#DDA63A",
    "SDG3_Good Health and Well-being": "#4C9F38",
    "SDG4_Quality Education": "#C5192D",
    "SDG5_Gender Equality": "#FF3A21",
    "SDG6_Clean Water and Sanitation": "#26BDE2",
    "SDG7_Affordable and Clean Energy": "#FCC30B",
    "SDG8_Decent Work and Economic Growth": "#A21942",
    "SDG9_Industry, Innovation and Infrastructure": "#FD6925",
    "SDG10_Reduced Inequality": "#DD1367",
    "SDG11_Sustainable Cities and Communities": "#FD9D24",
    "SDG12_Responsible Consumption and Production": "#BF8B2E",
    "SDG13_Climate Action": "#3F7E44",
    "SDG14_Life Below Water": "#0A97D9",
    "SDG15_Life on Land": "#56C02B",
    "SDG16_Peace, Justice and Strong Institutions": "#00689D"
}

# Function to plot SDG dominant bar graphs using Plotly
def plot_sdg(df, title, pred_column, x_axis_title=None, y_axis_title=None, icons_folder='assets/icons/'):
    """
    Plots a horizontal bar graph of SDG predictions and superimposes the icon of the most frequent SDG.
    
    Args:
        df (pd.DataFrame): DataFrame containing SDG predictions.
        title (str): Title of the plot.
        pred_column (str): Column name to use for plotting (e.g., 'pred1').
        x_axis_title (str): Title for the x-axis.
        y_axis_title (str): Title for the y-axis.
        icons_folder (str): Path to the folder containing SDG icons.
    
    Returns:
        plotly.graph_objs._figure.Figure: The Plotly figure object.
    """
    df_filtered = df[df[pred_column].notna()]
    labels = df_filtered[pred_column].value_counts().sort_values(ascending=False)
    total = labels.sum()
    percentages = (labels / total) * 100

    # Create a horizontal bar plot with Plotly
    fig = px.bar(
        percentages.rename_axis('SDG Label').reset_index(name='Percentage'),
        y='SDG Label',
        x='Percentage',
        orientation='h',
        title=title,
        color='SDG Label',
        color_discrete_map=sdg_colors  # Use the defined unique colors for each SDG
    )

    # Update y-axis to show labels
    fig.update_yaxes(showticklabels=True)

    # Add percentage labels to the bars
    fig.update_traces(
        texttemplate='%{x:.2f}%',
        textposition='auto',
        textfont=dict(size=11)
    )

    # Adjust layout for better visibility
    fig.update_layout(
        title=dict(
            text=title, font=dict(size=14)  # Title font size
        ),
        yaxis=dict(
            automargin=True,
            title=y_axis_title,
            tickfont=dict(size=13)
        ),
        margin=dict(l=20, r=100, t=30, b=20),  # Increased right margin for icon
        height=600,
        #width=800,
        showlegend=False,
        template="simple_white",
        xaxis=dict(
            title=x_axis_title,
            tickfont=dict(size=13)  # x-axis tick font size
        ),
    )

    # Identify the most frequent SDG
    if not percentages.empty:
        top_sdg_label = percentages.index[0]  # e.g., 'SDG1_No Poverty'

        # Map SDG label to icon filename
        # Assuming naming convention 'SDG1.png', 'SDG2.png', etc.
        sdg_number = top_sdg_label.split('_')[0]  # Extract 'SDG1'
        icon_filename = f"{sdg_number}.png"  # e.g., 'SDG1.png'
        icon_path = os.path.join(icons_folder, icon_filename)

        # Check if the icon file exists
        if os.path.exists(icon_path):
            # Read and encode the image
            with open(icon_path, 'rb') as image_file:
                encoded_image = base64.b64encode(image_file.read()).decode('utf-8')

            # Add the icon as an image in the Plotly figure
            fig.add_layout_image(
                dict(
                    source='data:image/png;base64,' + encoded_image,
                    xref="paper", yref="paper",
                    x=.98, y=1.0,  # Positioning: slightly to the right and top
                    sizex=0.2, sizey=0.2,  # Size of the icon
                    xanchor="left",
                    yanchor="top",
                    layer="above"  # Ensure the icon is above other plot elements
                )
            )
        else:
            print(f"Icon file '{icon_path}' not found. Skipping icon overlay.")

    return fig

def save_figure_as_jpeg(fig, filename):
    """Saves the Plotly figure as a high-resolution JPEG."""
    pio.write_image(fig, filename, format='jpeg', width=700, height=650, scale=7, engine="kaleido")

# Generate reports (page and sentence levels)
def generate_page_report(df_pages, report_file_name):
    doc = Document()
    doc.add_heading("Page-Level SDG Analysis Report", 0)

    doc.add_heading("πŸ“‹ General Notes", level=2)
    doc.add_paragraph(
        'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This page-level '
        'analysis provides high-level insight into SDG alignment.'
        '\n\n'
        'Given that a page may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each page with a probability score greater than zero.'
    )

    doc.add_heading("Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the Primary SDG, i.e. the SDG the AI model most strongly associates with each page. The bars '
        'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
        'sustainable development theme within the document.'
    )

    doc.add_heading("Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDGs for pages. Although these SDGs are '
        'not the primary focus, the text has some relevance to these goals.'
    )
    
    for doc_name in df_pages['Document'].unique():
        # Sanitize doc_name to use in file names
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        doc.add_heading(f"πŸ“„ Document: {doc_name}", level=2)
        df_doc = df_pages[df_pages['Document'] == doc_name]

        # Generate and save graphs
        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_page.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_page.jpeg"

        plot_sdg(
            df_doc, "", 'pred1',
            x_axis_title="Percentage (%) of aligned pages",
            y_axis_title="Primary SDGs"
        ).write_image(first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
        
        plot_sdg(
            df_doc, "", 'pred2',
            x_axis_title="Percentage (%) of aligned pages",
            y_axis_title="Secondary SDGs"
        ).write_image(second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

        # Add plots to the Word document
        doc.add_picture(first_sdg_plot_path, width=Inches(6))
        doc.add_picture(second_sdg_plot_path, width=Inches(6))

    doc.save(report_file_name)
    return report_file_name

def generate_sentence_report(df_sentences, report_file_name):
    doc = Document()
    doc.add_heading("Sentence-Level SDG Analysis Report", 0)

    doc.add_heading("πŸ“‹ General Notes", level=2)
    doc.add_paragraph(
        'This app splits documents into sentences using a natural language processing algorithm. '
        'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This sentence-level '
        'analysis provides deeper insight into SDG alignment.'
        '\n\n'
        'Given that a sentence may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each sentence with a probability score greater than zero.'
    )

    doc.add_heading("Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the Primary SDG, i.e. the SDG the AI model most strongly associates with each sentence. The bars '
        'represent the percentage of sentences most strongly aligned with each SDG. This offers deeper insight '
        'into the dominant sustainable development theme within the document.'
    )

    doc.add_heading("Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDGs for sentences. Although these SDGs are not '
        'the primary focus, the text has some relevance to these goals.'
    )

    for doc_name in df_sentences['Document'].unique():
        # Sanitize doc_name to use in file names
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        doc.add_heading(f"πŸ“„ Document: {doc_name}", level=2)
        df_doc = df_sentences[df_sentences['Document'] == doc_name]

        # Generate and save graphs
        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_sentence.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_sentence.jpeg"

        plot_sdg(
            df_doc, "", 'pred1',
            x_axis_title="Percentage (%) of aligned sentences",
            y_axis_title="Primary SDGs"
        ).write_image(first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

        plot_sdg(
            df_doc, "", 'pred2',
            x_axis_title="Percentage (%) of aligned sentences",
            y_axis_title="Secondary SDGs"
        ).write_image(second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

        # Add plots to the Word document
        doc.add_picture(first_sdg_plot_path, width=Inches(6))
        doc.add_picture(second_sdg_plot_path, width=Inches(6))

    doc.save(report_file_name)
    return report_file_name

# New text extraction functions with text cleaning and line joining
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.
    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): The starting page number for extraction (1-based index).
        end_page (int, optional): The ending page number for extraction (1-based index).
    Returns:
        tuple: 
            - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
            - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.
    """
    try:
        # Initialize the loader
        loader = PyPDFLoader(pdf_file_path)
        documents = loader.load()  # PyPDFLoader returns one Document per page

        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # Extract document name

        # Validate and adjust page range
        if start_page is not None and end_page is not None:
            # Convert to integers to avoid slicing issues
            start_page = int(start_page)
            end_page = int(end_page)

            # Adjust to valid range
            if start_page < 1:
                start_page = 1
            if end_page > total_pages:
                end_page = total_pages
            if start_page > end_page:
                start_page, end_page = end_page, start_page  # Swap if out of order

            # Select the subset of documents based on user input
            selected_docs = documents[start_page - 1:end_page]
        else:
            selected_docs = documents
            start_page = 1
            end_page = total_pages

        # Initialize lists to store data
        page_data = []
        sentence_data = []

        for idx, doc in enumerate(selected_docs, start=start_page):
            page_num = idx
            text = doc.page_content.strip()

            # Join lines that belong to the same sentence
            lines = text.split('\n')
            joined_text = ' '.join(line.strip() for line in lines if line.strip())

            # Clean text
            cleaned_text = clean_text(joined_text)

            # Append page-wise data
            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": cleaned_text
            })

            # Sentence tokenization
            sentences = sent_tokenize(cleaned_text)
            for sentence in sentences:
                sentence = sentence.strip()
                if sentence and len(sentence) > 70:  # keep only substantive sentences (> 70 characters)
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        # Create DataFrames
        page_df = pd.DataFrame(page_data)
        sentence_df = pd.DataFrame(sentence_data)

        return page_df, sentence_df

    except Exception as e:
        raise RuntimeError(f"Error during PDF extraction: {e}")

def df_to_csv_bytes(df):
    """
    Convert DataFrame to CSV in bytes.
    Args:
        df (pd.DataFrame): The DataFrame to convert.
    Returns:
        bytes: CSV data in bytes.
    """
    try:
        buffer = BytesIO()
        df.to_csv(buffer, index=False)
        csv_data = buffer.getvalue()
        buffer.close()
        return csv_data
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}")

def launch_interface():
    with gr.Blocks(css=custom_css) as demo:

        # Title as a visible heading at the top of the page with an icon
        gr.Markdown(
            """
            # 🌍 SDG Document Analysis App  
            Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
            """
        )

        # Shared PDF file input for both analyses
        gr.Markdown("## Upload PDF File")
        with gr.Row():
            file_input = gr.File(
                label="πŸ“ Upload PDF File for Analysis", file_types=[".pdf"]
            )

        # Extraction mode selection with explanatory text
        gr.Markdown(
            """
            ## PDF Text Extraction Mode  
            Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select 
            "Range of Pages" and specify the start and end pages. 
            """
        )
        with gr.Row():
            extraction_mode = gr.Radio(
                choices=["All Pages", "Range of Pages"],
                value="All Pages",
                label="Extraction Mode"
            )

        with gr.Row():
            start_page = gr.Number(value=1, label="🔒 Start Page", visible=False, info="The cover page is page 1")
            end_page = gr.Number(value=1, label="🔒 End Page", visible=False)

        # Function to update visibility of start_page and end_page
        def update_page_inputs(extraction_mode):
            if extraction_mode == "Range of Pages":
                return gr.update(visible=True), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=False)

        extraction_mode.change(
            update_page_inputs,
            inputs=extraction_mode,
            outputs=[start_page, end_page]
        )

        # Main Tabs for Page-Level and Sentence-Level Analysis
        gr.Markdown("## SDG Analysis Type")
        
        with gr.Tab("πŸ“„ Page-Level Analysis"):
            gr.Markdown(
                """
                ### Page-Level SDG Analysis  
                This section conducts Sustainable Development Goals (SDG) mapping 
                of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT). 
                It provides **high-level SDG mapping** of documents at the page level.
                """
            )

            with gr.Row():
                page_button = gr.Button("🏃‍♂️ Run Page-Level Analysis")
                reset_page_button = gr.Button("🔄 Reset Page-Level Analysis", elem_classes="reset-button")

            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("πŸ“Š Primary SDGs"):
                    with gr.Row():
                        primary_page_plot = gr.Plot(label="πŸ“Š Primary SDGs Graph [Page-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Primary SDGs bar graph on the left will show "+
                            "the percentage of pages that strongly align with each SDG. The icon for the most frequent "+
                            "SDG will be highlighted on the graph. Download the Page Predictions CSV for further details.",
                            label = "Note", container=True
                        )
                        
                    gr.Markdown("##### Download Results")    
                    with gr.Row():
                        page_csv = gr.File(label="πŸ“Š Download Page Predictions CSV")
                        page_docx = gr.File(label="πŸ“„ Download Page Report DOCX")
                        page_jpeg1 = gr.File(label="πŸ–ΌοΈ Download Primary SDGs JPEG")

                with gr.TabItem("πŸ“ˆ Secondary SDGs"):
                    with gr.Row():
                        secondary_page_plot = gr.Plot(label="πŸ“ˆ Secondary SDGs Graph [Page-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Secondary SDGs bar graph on the left will show "+
                            "SDGs that are not the primary focus of the pages analysed. These SDGs are second to the "+
                            "Primary SDGs. Download the Sentence Predictions CSV for further details",
                            label = "Note", container=True
                        )
                    
                    gr.Markdown("##### Download Results") 
                    with gr.Row():
                        page_csv_secondary = gr.File(label="πŸ“Š Download Page Predictions CSV")
                        page_report_file_secondary = gr.File(label="πŸ“„ Download Page Report DOCX")
                        secondary_page_jpeg = gr.File(label="πŸ–ΌοΈ Download Secondary SDGs JPEG")    

        with gr.Tab("✍️ Sentence-Level Analysis"):
            gr.Markdown(
                """
                ### Sentence-Level SDG Analysis  
                This section conducts Sustainable Development Goals (SDG) mapping 
                using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT). 
                It provides **detailed SDG mapping** at the sentence level.
                """
            )

            with gr.Row():
                sentence_button = gr.Button("🏃‍♂️ Run Sentence-Level Analysis")
                reset_sentence_button = gr.Button("🔄 Reset Sentence-Level Analysis", elem_classes="reset-button")

            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("πŸ“Š Primary SDGs"):
                    with gr.Row():
                        primary_sentence_plot = gr.Plot(label="πŸ“Š Primary SDGs Graph [Sentence-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Primary SDGs bar graph on the left will show "+
                            "the percentage of sentences that strongly align with each SDG. The icon for the most frequent "+
                            "SDG will be highlighted on the graph. Download the Sentence Predictions CSV for further details.",
                            label = "Note", container=True
                        )
                        
                    gr.Markdown("##### Download Results")  
                    with gr.Row():
                        sentence_csv = gr.File(label="πŸ“Š Download Sentence Predictions CSV")
                        sentence_docx = gr.File(label="πŸ“„ Download Sentence Report DOCX")
                        sentence_jpeg1 = gr.File(label="πŸ–ΌοΈ Download Primary SDGs JPEG")

                with gr.TabItem("πŸ“ˆ Secondary SDGs"):
                    with gr.Row():
                        secondary_sentence_plot = gr.Plot(label="πŸ“ˆ Secondary SDGs Graph [Sentence-Level]", scale=2)
                        gr.Markdown(
                            "When the analysis is done, the Secondary SDGs bar graph on the left will show "+
                            "SDGs that are not the primary focus of the sentences analysed. These SDGs are second to the "+
                            "Primary SDGs. Download the Sentence Predictions CSV for further details",
                            label = "Note", container=True
                        )
                        
                    gr.Markdown("##### Download Results")  
                    with gr.Row():
                        sentence_csv_secondary = gr.File(label="πŸ“Š Download Sentence Predictions CSV")
                        sentence_report_file_secondary = gr.File(label="πŸ“„ Download Sentence Report DOCX")
                        secondary_sentence_jpeg = gr.File(label="πŸ–ΌοΈ Download Secondary SDGs JPEG")
            
        # Function to process page-level analysis
        @spaces.GPU
        def process_pages(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]

            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                    original_file_name = 'uploaded_document'

                # Sanitize the file name to use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

                # Determine page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)

                # Extract text and create DataFrames
                page_df, _ = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at page level
                df_page_predictions = predict_pages(page_df)

                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_page_predictions, "",
                    'pred1',
                    x_axis_title="Percentage (%) of aligned pages",
                    y_axis_title="Primary SDGs"
                )
                second_plot = plot_sdg(
                    df_page_predictions, "",
                    'pred2',
                    x_axis_title="Percentage (%) of aligned pages",
                    y_axis_title="Secondary SDGs"
                )

                # Define output file names
                page_csv_file = f"{sanitized_file_name}_SDG-Page_predictions.csv"
                page_report_file = f"{sanitized_file_name}_SDG-Page_report.docx"
                primary_page_jpeg = f"{sanitized_file_name}_SDG-Page_primary_graph.jpeg"

                # The Secondary tab reuses the same predictions CSV and DOCX report; only the JPEG differs
                page_csv_file_secondary = f"{sanitized_file_name}_SDG-Page_predictions.csv"
                page_report_file_secondary = f"{sanitized_file_name}_SDG-Page_report.docx"
                secondary_page_jpeg = f"{sanitized_file_name}_SDG-Page_secondary_graph.jpeg"

                # Save CSV and reports
                df_page_predictions.to_csv(page_csv_file, index=False)
                page_report_primary = generate_page_report(df_page_predictions, page_report_file)

                df_page_predictions.to_csv(page_csv_file_secondary, index=False)
                page_report_secondary = generate_page_report(df_page_predictions, page_report_file_secondary)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_page_jpeg)
                save_figure_as_jpeg(second_plot, secondary_page_jpeg)

                return (
                    first_plot, second_plot,
                    page_csv_file, page_report_file, primary_page_jpeg,
                    page_csv_file_secondary, page_report_file_secondary, secondary_page_jpeg
                )

            except Exception as e:
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]

        # Function to process sentence-level analysis
        @spaces.GPU
        def process_sentences(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]

            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                    original_file_name = 'uploaded_document'

                # Sanitize the file name to use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

                # Determine page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)

                # Extract text and create DataFrames
                _, sentence_df = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at sentence level
                df_sentence_predictions = predict_sentences(sentence_df)

                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_sentence_predictions, "",
                    'pred1',
                    x_axis_title="Percentage (%) of aligned sentences",
                    y_axis_title="Primary SDGs"
                )
                second_plot = plot_sdg(
                    df_sentence_predictions, "",
                    'pred2',
                    x_axis_title="Percentage (%) of aligned sentences",
                    y_axis_title="Secondary SDGs"
                )

                # Define output file names
                sentence_csv_file = f"{sanitized_file_name}_SDG-Sentence_predictions.csv"
                sentence_report_file = f"{sanitized_file_name}_SDG-Sentence_report.docx"
                primary_sentence_jpeg = f"{sanitized_file_name}_SDG-Sentence_primary_graph.jpeg"

                # The Secondary tab reuses the same predictions CSV and DOCX report; only the JPEG differs
                sentence_csv_file_secondary = f"{sanitized_file_name}_SDG-Sentence_predictions.csv"
                sentence_report_file_secondary = f"{sanitized_file_name}_SDG-Sentence_report.docx"
                secondary_sentence_jpeg = f"{sanitized_file_name}_SDG-Sentence_secondary_graph.jpeg"

                # Save CSV and reports
                df_sentence_predictions.to_csv(sentence_csv_file, index=False)
                sentence_report_primary = generate_sentence_report(df_sentence_predictions, sentence_report_file)

                df_sentence_predictions.to_csv(sentence_csv_file_secondary, index=False)
                sentence_report_secondary = generate_sentence_report(df_sentence_predictions, sentence_report_file_secondary)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_sentence_jpeg)
                save_figure_as_jpeg(second_plot, secondary_sentence_jpeg)

                return (
                    first_plot, second_plot,
                    sentence_csv_file, sentence_report_file, primary_sentence_jpeg,
                    sentence_csv_file_secondary, sentence_report_file_secondary, secondary_sentence_jpeg
                )

            except Exception as e:
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]

        # Reset functions to clear the outputs
        def reset_page_outputs():
            return [None, None, None, None, None, None, None, None]

        def reset_sentence_outputs():
            return [None, None, None, None, None, None, None, None]

        # Button actions for Page-Level Analysis
        page_button.click(
            process_pages,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_page_plot,           # 📊 Primary SDGs [Page-Level]
                secondary_page_plot,         # 📈 Secondary SDGs [Page-Level]
                page_csv,                    # 📊 Download Page Predictions CSV
                page_docx,                   # 📄 Download Page Report DOCX
                page_jpeg1,                  # 🖼️ Download Primary SDGs JPEG
                page_csv_secondary,          # 📊 Download Page Predictions CSV
                page_report_file_secondary,  # 📄 Download Page Report DOCX
                secondary_page_jpeg          # 🖼️ Download Secondary SDGs JPEG
            ]
        )

        reset_page_button.click(
            reset_page_outputs,
            outputs=[
                primary_page_plot,
                secondary_page_plot,
                page_csv,
                page_docx,
                page_jpeg1,
                page_csv_secondary,
                page_report_file_secondary,
                secondary_page_jpeg
            ]
        )

        # Button actions for Sentence-Level Analysis
        sentence_button.click(
            process_sentences,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_sentence_plot,           # 📊 Primary SDGs [Sentence-Level]
                secondary_sentence_plot,         # 📈 Secondary SDGs [Sentence-Level]
                sentence_csv,                    # 📊 Download Sentence Predictions CSV
                sentence_docx,                   # 📄 Download Sentence Report DOCX
                sentence_jpeg1,                  # 🖼️ Download Primary SDGs JPEG
                sentence_csv_secondary,          # 📊 Download Sentence Predictions CSV
                sentence_report_file_secondary,  # 📄 Download Sentence Report DOCX
                secondary_sentence_jpeg          # 🖼️ Download Secondary SDGs JPEG
            ]
        )

        reset_sentence_button.click(
            reset_sentence_outputs,
            outputs=[
                primary_sentence_plot,
                secondary_sentence_plot,
                sentence_csv,
                sentence_docx,
                sentence_jpeg1,
                sentence_csv_secondary,
                sentence_report_file_secondary,
                secondary_sentence_jpeg
            ]
        )

    demo.queue().launch()

launch_interface()