Update app.py
app.py
CHANGED
@@ -25,6 +25,18 @@ from langchain_community.document_loaders import PyPDFLoader
 # Model checkpoint for SDG BERT
 checkpoint = "sadickam/sdgBERT"
 
+# Text cleaning function
+def clean_text(text):
+    """
+    Cleans the extracted text by removing irrelevant characters while retaining currency symbols.
+    """
+    text = text.strip()
+    # Define the allowed characters (including currency symbols)
+    allowed_chars = r'[^a-zA-Z0-9\s\.,!?$€£¥₹¢₩]'
+    text = re.sub(allowed_chars, '', text)
+    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
+    return text
+
 # Preprocessing function for text
 def prep_text(text):
     clean_sents = []
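A quick sanity check of the new helper, runnable on its own (the sample string is invented, and the function body is a condensed copy of the hunk above with the `re` import, which app.py already relies on, made explicit):

    import re

    def clean_text(text):
        # Condensed copy of the helper added above
        text = text.strip()
        text = re.sub(r'[^a-zA-Z0-9\s\.,!?$€£¥₹¢₩]', '', text)
        return re.sub(r'\s+', ' ', text)

    print(clean_text("  Budget: $1,200 (approx.)\n• see page 4  "))
    # -> Budget $1,200 approx. see page 4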
@@ -74,7 +86,8 @@ def predict_pages(page_df, batch_size=32):
     for start in range(0, num_rows, batch_size):
         end = min(start + batch_size, num_rows)
         df_chunk = page_df.iloc[start:end]
-        texts = df_chunk['Text'].apply(prep_text).tolist()
+        # Clean text
+        texts = df_chunk['Text'].apply(clean_text).apply(prep_text).tolist()
         predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
         for predictions in predictions_batch:
             sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
@@ -109,7 +122,8 @@ def predict_sentences(sentence_df, batch_size=32):
     for start in range(0, num_rows, batch_size):
         end = min(start + batch_size, num_rows)
         df_chunk = sentence_df.iloc[start:end]
-        texts = df_chunk['Sentence'].apply(prep_text).tolist()
+        # Clean text
+        texts = df_chunk['Sentence'].apply(clean_text).apply(prep_text).tolist()
         predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
         for predictions in predictions_batch:
             sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
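Both prediction loops now share the same two-step normalization per pandas chunk. A minimal sketch of that pipeline, assuming it runs inside app.py where clean_text and prep_text are defined (the column values are invented):

    import pandas as pd

    df_chunk = pd.DataFrame({"Sentence": ["First  raw\nsentence (1).", "Second: raw sentence!"]})
    # Clean, then preprocess, then hand a plain Python list to the batched tokenizer
    texts = df_chunk["Sentence"].apply(clean_text).apply(prep_text).tolist()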
@@ -310,7 +324,7 @@ def generate_sentence_report(df_sentences):
     doc.save("sentence_report.docx")
     return "sentence_report.docx"
 
-# New text extraction functions
+# New text extraction functions with text cleaning and line joining
 def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
     """
     Extract text from a PDF page by page using LangChain's PyPDFLoader.
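For context, extract_text_with_py_pdf_loader builds on standard LangChain usage of PyPDFLoader (the path below is hypothetical):

    from langchain_community.document_loaders import PyPDFLoader

    loader = PyPDFLoader("example.pdf")  # hypothetical path
    documents = loader.load()            # one Document per PDF page
    print(documents[0].page_content[:200])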
@@ -360,15 +374,22 @@ def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
         page_num = idx
         text = doc.page_content.strip()
 
+        # Join lines that belong to the same sentence
+        lines = text.split('\n')
+        joined_text = ' '.join(line.strip() for line in lines if line.strip())
+
+        # Clean text
+        cleaned_text = clean_text(joined_text)
+
         # Append page-wise data
         page_data.append({
             "Document": doc_name,
             "Page": page_num,
-            "Text": text
+            "Text": cleaned_text
         })
 
         # Sentence tokenization
-        sentences = sent_tokenize(text)
+        sentences = sent_tokenize(cleaned_text)
         for sentence in sentences:
             sentence = sentence.strip()
             if sentence:
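The join-then-clean sequence is easy to verify in isolation (sample text invented):

    text = "This sentence was\nwrapped across\nthree lines by the PDF layout."
    lines = text.split('\n')
    joined_text = ' '.join(line.strip() for line in lines if line.strip())
    assert joined_text == "This sentence was wrapped across three lines by the PDF layout."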
@@ -407,10 +428,10 @@ def df_to_csv_bytes(df):
 def launch_interface():
     with gr.Blocks(title="SDG Document Analysis App") as demo:
 
-        # Title as a visible heading at the top of the page
+        # Title as a visible heading at the top of the page with an icon
        gr.Markdown(
             """
-            # SDG Document Analysis App
+            # 🌍 SDG Document Analysis App
             Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
             """
         )
@@ -421,14 +442,23 @@ def launch_interface():
                 label="Upload PDF File for Analysis", file_types=[".pdf"]
             )
 
-        # Extraction mode selection
-        extraction_mode = gr.Radio(
-            choices=["All Pages", "Range of Pages"],
-            value="All Pages",
-            label="Extraction Mode"
+        # Extraction mode selection with explanatory text
+        gr.Markdown(
+            """
+            ## Extraction Mode
+            Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select "Range of Pages" and specify the start and end pages.
+            """
         )
-        start_page = gr.Number(value=1, label="Start Page", visible=False)
-        end_page = gr.Number(value=1, label="End Page", visible=False)
+        with gr.Row():
+            extraction_mode = gr.Radio(
+                choices=["All Pages", "Range of Pages"],
+                value="All Pages",
+                label="Extraction Mode"
+            )
+
+        with gr.Row():
+            start_page = gr.Number(value=1, label="Start Page", visible=False)
+            end_page = gr.Number(value=1, label="End Page", visible=False)
 
         # Function to update visibility of start_page and end_page
         def update_page_inputs(extraction_mode):
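The body of update_page_inputs falls outside this hunk. A plausible sketch of the visibility toggle, assuming the standard gr.update mechanism and that the callback is wired to extraction_mode.change with start_page and end_page as outputs (inferred, not shown in the diff):

    def update_page_inputs(extraction_mode):
        # Show the page-range inputs only when a specific range is requested
        show = extraction_mode == "Range of Pages"
        return gr.update(visible=show), gr.update(visible=show)

    extraction_mode.change(
        update_page_inputs,
        inputs=extraction_mode,
        outputs=[start_page, end_page],
    )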
@@ -447,7 +477,7 @@ def launch_interface():
         with gr.Tab("Page-Level Analysis"):
             gr.Markdown(
                 """
-                ## Page-Level SDG Analysis
+                ## 📄 Page-Level SDG Analysis
                 This section conducts Sustainable Development Goals (SDG) mapping
                 of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                 It provides **high-level SDG mapping** of documents at the page level.
@@ -471,7 +501,7 @@ def launch_interface():
         with gr.Tab("Sentence-Level Analysis"):
             gr.Markdown(
                 """
-                ## Sentence-Level SDG Analysis
+                ## ✍️ Sentence-Level SDG Analysis
                 This section conducts Sustainable Development Goals (SDG) mapping
                 using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                 It provides **detailed SDG mapping** at the sentence level.
|