sadickam committed on
Commit
00b2fb8
·
verified ·
1 Parent(s): c5beecc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -16
app.py CHANGED
@@ -25,6 +25,18 @@ from langchain_community.document_loaders import PyPDFLoader
25
  # Model checkpoint for SDG BERT
26
  checkpoint = "sadickam/sdgBERT"
27
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # Preprocessing function for text
29
  def prep_text(text):
30
  clean_sents = []
@@ -74,7 +86,8 @@ def predict_pages(page_df, batch_size=32):
74
  for start in range(0, num_rows, batch_size):
75
  end = min(start + batch_size, num_rows)
76
  df_chunk = page_df.iloc[start:end]
77
- texts = df_chunk['Text'].apply(prep_text).tolist()
 
78
  predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
79
  for predictions in predictions_batch:
80
  sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
@@ -109,7 +122,8 @@ def predict_sentences(sentence_df, batch_size=32):
109
  for start in range(0, num_rows, batch_size):
110
  end = min(start + batch_size, num_rows)
111
  df_chunk = sentence_df.iloc[start:end]
112
- texts = df_chunk['Sentence'].apply(prep_text).tolist()
 
113
  predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
114
  for predictions in predictions_batch:
115
  sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
@@ -310,7 +324,7 @@ def generate_sentence_report(df_sentences):
310
  doc.save("sentence_report.docx")
311
  return "sentence_report.docx"
312
 
313
- # New text extraction functions
314
  def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
315
  """
316
  Extract text from a PDF page by page using LangChain's PyPDFLoader.
@@ -360,15 +374,22 @@ def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=Non
360
  page_num = idx
361
  text = doc.page_content.strip()
362
 
 
 
 
 
 
 
 
363
  # Append page-wise data
364
  page_data.append({
365
  "Document": doc_name,
366
  "Page": page_num,
367
- "Text": text
368
  })
369
 
370
  # Sentence tokenization
371
- sentences = sent_tokenize(text)
372
  for sentence in sentences:
373
  sentence = sentence.strip()
374
  if sentence:
@@ -407,10 +428,10 @@ def df_to_csv_bytes(df):
407
  def launch_interface():
408
  with gr.Blocks(title="SDG Document Analysis App") as demo:
409
 
410
- # Title as a visible heading at the top of the page
411
  gr.Markdown(
412
  """
413
- # SDG Document Analysis App
414
  Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
415
  """
416
  )
@@ -421,14 +442,23 @@ def launch_interface():
421
  label="Upload PDF File for Analysis", file_types=[".pdf"]
422
  )
423
 
424
- # Extraction mode selection
425
- extraction_mode = gr.Radio(
426
- choices=["All Pages", "Range of Pages"],
427
- value="All Pages",
428
- label="Extraction Mode"
 
429
  )
430
- start_page = gr.Number(value=1, label="Start Page", visible=False)
431
- end_page = gr.Number(value=1, label="End Page", visible=False)
 
 
 
 
 
 
 
 
432
 
433
  # Function to update visibility of start_page and end_page
434
  def update_page_inputs(extraction_mode):
@@ -447,7 +477,7 @@ def launch_interface():
447
  with gr.Tab("Page-Level Analysis"):
448
  gr.Markdown(
449
  """
450
- ## Page-Level SDG Analysis
451
  This section conducts Sustainable Development Goals (SDG) mapping
452
  of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
453
  It provides **high-level SDG mapping** of documents at the page level.
@@ -471,7 +501,7 @@ def launch_interface():
471
  with gr.Tab("Sentence-Level Analysis"):
472
  gr.Markdown(
473
  """
474
- ## Sentence-Level SDG Analysis
475
  This section conducts Sustainable Development Goals (SDG) mapping
476
  using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
477
  It provides **detailed SDG mapping** at the sentence level.
 
25
  # Model checkpoint for SDG BERT
26
  checkpoint = "sadickam/sdgBERT"
27
 
28
+ # Text cleaning function
29
+ def clean_text(text):
30
+ """
31
+ Cleans the extracted text by removing irrelevant characters but retains currency symbols.
32
+ """
33
+ text = text.strip()
34
+ # Define the allowed characters (including currency symbols)
35
+ allowed_chars = r'[^a-zA-Z0-9\s\.,!?$€£¥₹¢₩]'
36
+ text = re.sub(allowed_chars, '', text)
37
+ text = re.sub(r'\s+', ' ', text) # Replace multiple spaces with a single space
38
+ return text
39
+
40
  # Preprocessing function for text
41
  def prep_text(text):
42
  clean_sents = []
 
86
  for start in range(0, num_rows, batch_size):
87
  end = min(start + batch_size, num_rows)
88
  df_chunk = page_df.iloc[start:end]
89
+ # Clean text
90
+ texts = df_chunk['Text'].apply(clean_text).apply(prep_text).tolist()
91
  predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
92
  for predictions in predictions_batch:
93
  sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
 
122
  for start in range(0, num_rows, batch_size):
123
  end = min(start + batch_size, num_rows)
124
  df_chunk = sentence_df.iloc[start:end]
125
+ # Clean text
126
+ texts = df_chunk['Sentence'].apply(clean_text).apply(prep_text).tolist()
127
  predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
128
  for predictions in predictions_batch:
129
  sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
 
324
  doc.save("sentence_report.docx")
325
  return "sentence_report.docx"
326
 
327
+ # New text extraction functions with text cleaning and line joining
328
  def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
329
  """
330
  Extract text from a PDF page by page using LangChain's PyPDFLoader.
 
374
  page_num = idx
375
  text = doc.page_content.strip()
376
 
377
+ # Join lines that belong to the same sentence
378
+ lines = text.split('\n')
379
+ joined_text = ' '.join(line.strip() for line in lines if line.strip())
380
+
381
+ # Clean text
382
+ cleaned_text = clean_text(joined_text)
383
+
384
  # Append page-wise data
385
  page_data.append({
386
  "Document": doc_name,
387
  "Page": page_num,
388
+ "Text": cleaned_text
389
  })
390
 
391
  # Sentence tokenization
392
+ sentences = sent_tokenize(cleaned_text)
393
  for sentence in sentences:
394
  sentence = sentence.strip()
395
  if sentence:
 
428
  def launch_interface():
429
  with gr.Blocks(title="SDG Document Analysis App") as demo:
430
 
431
+ # Title as a visible heading at the top of the page with an icon
432
  gr.Markdown(
433
  """
434
+ # 🌍 SDG Document Analysis App
435
  Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
436
  """
437
  )
 
442
  label="Upload PDF File for Analysis", file_types=[".pdf"]
443
  )
444
 
445
+ # Extraction mode selection with explanatory text
446
+ gr.Markdown(
447
+ """
448
+ ## Extraction Mode
449
+ Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select "Range of Pages" and specify the start and end pages.
450
+ """
451
  )
452
+ with gr.Row():
453
+ extraction_mode = gr.Radio(
454
+ choices=["All Pages", "Range of Pages"],
455
+ value="All Pages",
456
+ label="Extraction Mode"
457
+ )
458
+
459
+ with gr.Row():
460
+ start_page = gr.Number(value=1, label="Start Page", visible=False)
461
+ end_page = gr.Number(value=1, label="End Page", visible=False)
462
 
463
  # Function to update visibility of start_page and end_page
464
  def update_page_inputs(extraction_mode):
 
477
  with gr.Tab("Page-Level Analysis"):
478
  gr.Markdown(
479
  """
480
+ ## 📄 Page-Level SDG Analysis
481
  This section conducts Sustainable Development Goals (SDG) mapping
482
  of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
483
  It provides **high-level SDG mapping** of documents at the page level.
 
501
  with gr.Tab("Sentence-Level Analysis"):
502
  gr.Markdown(
503
  """
504
+ ## ✍️ Sentence-Level SDG Analysis
505
  This section conducts Sustainable Development Goals (SDG) mapping
506
  using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
507
  It provides **detailed SDG mapping** at the sentence level.