Update app.py
app.py
CHANGED
@@ -25,6 +25,18 @@ from langchain_community.document_loaders import PyPDFLoader
 # Model checkpoint for SDG BERT
 checkpoint = "sadickam/sdgBERT"
 
+# Text cleaning function
+def clean_text(text):
+    """
+    Cleans the extracted text by removing irrelevant characters while retaining currency symbols.
+    """
+    text = text.strip()
+    # Define the allowed characters (including currency symbols)
+    allowed_chars = r'[^a-zA-Z0-9\s\.,!?$€£¥₹¢₩]'
+    text = re.sub(allowed_chars, '', text)
+    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
+    return text
+
 # Preprocessing function for text
 def prep_text(text):
     clean_sents = []
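A quick sanity check of the new helper, runnable on its own (the sample string is invented, and the function body is a condensed copy of the hunk above with the `re` import, which app.py already relies on, made explicit):

    import re

    def clean_text(text):
        # Condensed copy of the helper added above
        text = text.strip()
        text = re.sub(r'[^a-zA-Z0-9\s\.,!?$€£¥₹¢₩]', '', text)
        return re.sub(r'\s+', ' ', text)

    print(clean_text("  Budget: $1,200 (approx.)\n• see page 4  "))
    # -> Budget $1,200 approx. see page 4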
@@ -74,7 +86,8 @@ def predict_pages(page_df, batch_size=32):
     for start in range(0, num_rows, batch_size):
         end = min(start + batch_size, num_rows)
         df_chunk = page_df.iloc[start:end]
-        texts = df_chunk['Text'].apply(prep_text).tolist()
+        # Clean text
+        texts = df_chunk['Text'].apply(clean_text).apply(prep_text).tolist()
         predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
         for predictions in predictions_batch:
             sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
@@ -109,7 +122,8 @@ def predict_sentences(sentence_df, batch_size=32):
     for start in range(0, num_rows, batch_size):
         end = min(start + batch_size, num_rows)
         df_chunk = sentence_df.iloc[start:end]
-        texts = df_chunk['Sentence'].apply(prep_text).tolist()
+        # Clean text
+        texts = df_chunk['Sentence'].apply(clean_text).apply(prep_text).tolist()
         predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
         for predictions in predictions_batch:
             sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
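Both prediction loops now share the same two-step normalization per pandas chunk. A minimal sketch of that pipeline, assuming it runs inside app.py where clean_text and prep_text are defined (the column values are invented):

    import pandas as pd

    df_chunk = pd.DataFrame({"Sentence": ["First  raw\nsentence (1).", "Second: raw sentence!"]})
    # Clean, then preprocess, then hand a plain Python list to the batched tokenizer
    texts = df_chunk["Sentence"].apply(clean_text).apply(prep_text).tolist()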
@@ -310,7 +324,7 @@ def generate_sentence_report(df_sentences):
     doc.save("sentence_report.docx")
     return "sentence_report.docx"
 
-# New text extraction functions
+# New text extraction functions with text cleaning and line joining
 def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
     """
     Extract text from a PDF page by page using LangChain's PyPDFLoader.
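For context, extract_text_with_py_pdf_loader builds on standard LangChain usage of PyPDFLoader (the path below is hypothetical):

    from langchain_community.document_loaders import PyPDFLoader

    loader = PyPDFLoader("example.pdf")  # hypothetical path
    documents = loader.load()            # one Document per PDF page
    print(documents[0].page_content[:200])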
@@ -360,15 +374,22 @@ def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
         page_num = idx
         text = doc.page_content.strip()
 
+        # Join lines that belong to the same sentence
+        lines = text.split('\n')
+        joined_text = ' '.join(line.strip() for line in lines if line.strip())
+
+        # Clean text
+        cleaned_text = clean_text(joined_text)
+
         # Append page-wise data
         page_data.append({
             "Document": doc_name,
             "Page": page_num,
-            "Text": text
+            "Text": cleaned_text
         })
 
         # Sentence tokenization
-        sentences = sent_tokenize(text)
+        sentences = sent_tokenize(cleaned_text)
         for sentence in sentences:
             sentence = sentence.strip()
             if sentence:
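The join-then-clean sequence is easy to verify in isolation (sample text invented):

    text = "This sentence was\nwrapped across\nthree lines by the PDF layout."
    lines = text.split('\n')
    joined_text = ' '.join(line.strip() for line in lines if line.strip())
    assert joined_text == "This sentence was wrapped across three lines by the PDF layout."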
@@ -407,10 +428,10 @@ def df_to_csv_bytes(df):
 def launch_interface():
     with gr.Blocks(title="SDG Document Analysis App") as demo:
 
-        # Title as a visible heading at the top of the page
+        # Title as a visible heading at the top of the page with an icon
        gr.Markdown(
             """
-            # SDG Document Analysis App
+            # 🌍 SDG Document Analysis App
             Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
             """
         )
@@ -421,14 +442,23 @@ def launch_interface():
                 label="Upload PDF File for Analysis", file_types=[".pdf"]
             )
 
-        # Extraction mode selection
-        extraction_mode = gr.Radio(
-            choices=["All Pages", "Range of Pages"],
-            value="All Pages",
-            label="Extraction Mode"
+        # Extraction mode selection with explanatory text
+        gr.Markdown(
+            """
+            ## Extraction Mode
+            Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select "Range of Pages" and specify the start and end pages.
+            """
         )
-        start_page = gr.Number(value=1, label="Start Page", visible=False)
-        end_page = gr.Number(value=1, label="End Page", visible=False)
+        with gr.Row():
+            extraction_mode = gr.Radio(
+                choices=["All Pages", "Range of Pages"],
+                value="All Pages",
+                label="Extraction Mode"
+            )
+
+        with gr.Row():
+            start_page = gr.Number(value=1, label="Start Page", visible=False)
+            end_page = gr.Number(value=1, label="End Page", visible=False)
 
         # Function to update visibility of start_page and end_page
         def update_page_inputs(extraction_mode):
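The body of update_page_inputs falls outside this hunk. A plausible sketch of the visibility toggle, assuming the standard gr.update mechanism and that the callback is wired to extraction_mode.change with start_page and end_page as outputs (inferred, not shown in the diff):

    def update_page_inputs(extraction_mode):
        # Show the page-range inputs only when a specific range is requested
        show = extraction_mode == "Range of Pages"
        return gr.update(visible=show), gr.update(visible=show)

    extraction_mode.change(
        update_page_inputs,
        inputs=extraction_mode,
        outputs=[start_page, end_page],
    )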
@@ -447,7 +477,7 @@ def launch_interface():
         with gr.Tab("Page-Level Analysis"):
             gr.Markdown(
                 """
-                ## Page-Level SDG Analysis
+                ## 📄 Page-Level SDG Analysis
                 This section conducts Sustainable Development Goals (SDG) mapping
                 of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                 It provides **high-level SDG mapping** of documents at the page level.
@@ -471,7 +501,7 @@ def launch_interface():
         with gr.Tab("Sentence-Level Analysis"):
             gr.Markdown(
                 """
-                ## Sentence-Level SDG Analysis
+                ## ✍️ Sentence-Level SDG Analysis
                 This section conducts Sustainable Development Goals (SDG) mapping
                 using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                 It provides **detailed SDG mapping** at the sentence level.
|