Epitomea-demo-V2

Runtime error

App Files Files Community

kpal002 committed on Feb 6

Commit

6ce5fee

•

1 Parent(s): 8460e38

Update RAG_utils.py

Browse files

Files changed (1) hide show

RAG_utils.py +30 -2

RAG_utils.py CHANGED Viewed

@@ -11,6 +11,7 @@ import pandas as pd
 import numpy as np
 import evaluate
 import qdrant_client
 from pydantic import BaseModel, Field
 from typing import Any, List, Tuple, Set, Dict, Optional, Union
 from sklearn.metrics.pairwise import cosine_similarity
@@ -385,6 +386,32 @@ class PDFProcessor_Unstructured:
         return (current_chunk.endswith(",") or
                 (current_chunk[-1].islower() and next_chunk[0].islower()))
     def process_pdf(self) -> Tuple[List[str], List[str]]:
         """
         Processes the PDF by extracting, categorizing, and merging elements.
@@ -430,9 +457,10 @@ class PDFProcessor_Unstructured:
         try:
             logging.debug(f"Processing PDF at {self.file_path}")
-            results = self.process_pdf()  # Assuming this is a defined method
             logging.info("PDF processing completed successfully.")
-            return results
         except Exception as e:
             logging.error(f"Error processing PDF file: {e}", exc_info=True)
             raise

 import numpy as np
 import evaluate
 import qdrant_client
+from pypdf import PdfReader
 from pydantic import BaseModel, Field
 from typing import Any, List, Tuple, Set, Dict, Optional, Union
 from sklearn.metrics.pairwise import cosine_similarity
         return (current_chunk.endswith(",") or
                 (current_chunk[-1].islower() and next_chunk[0].islower()))
+    def extract_title_from_pdf(self, uploaded_file):
+        """
+        Extracts the title from a PDF file's metadata.
+        This function reads the metadata of a PDF file using PyPDF2 and attempts to
+        extract the title. If the title is present in the metadata, it is returned.
+        Otherwise, a default message indicating that the title was not found is returned.
+        Parameters:
+        uploaded_file (file): A file object or a path to the PDF file from which
+                          to extract the title. The file must be opened in binary mode.
+        Returns:
+        str: The title of the PDF file as a string. If no title is found, returns
+             'Title not found'.
+        """
+        # Initialize PDF reader
+        pdf_reader = PdfFileReader(uploaded_file)
+        # Extract document information
+        meta = pdf_reader.getDocumentInfo()
+        # Retrieve title from document information
+        title = meta.title if meta and meta.title else 'Title not found'
+        return title
     def process_pdf(self) -> Tuple[List[str], List[str]]:
         """
         Processes the PDF by extracting, categorizing, and merging elements.
         try:
             logging.debug(f"Processing PDF at {self.file_path}")
+            results = self.process_pdf()
+            title = extract_title_from_pdf(self.file_path)
             logging.info("PDF processing completed successfully.")
+            return results, title
         except Exception as e:
             logging.error(f"Error processing PDF file: {e}", exc_info=True)
             raise