# app/crud.py
import logging
import os
from typing import Optional

from langchain_community.document_loaders import (
    CSVLoader,
    Docx2txtLoader,
    PyPDFLoader,
    UnstructuredExcelLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from markitdown import MarkItDown

# from app.db.models.docs import *
# from app.schemas.schemas import DocumentCreate, DocumentUpdate

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

async def load_file_with_markitdown(file_path: str, llm_client=None, model: Optional[str] = None):
    """
    Converts a file to Markdown text using MarkItDown.

    Parameters:
        file_path (str): Path to the file to convert.
        llm_client: Optional LLM client used by MarkItDown to describe images.
        model (str): Name of the model to use with the LLM client.

    Returns:
        DocumentConverterResult: Conversion result with `title` and `text_content`.
    """
    if llm_client and model:
        # Pass the client and model by keyword: positional arguments would be
        # bound to the wrong MarkItDown constructor parameters.
        markitdown = MarkItDown(llm_client=llm_client, llm_model=model)
    else:
        markitdown = MarkItDown()

    return markitdown.convert(file_path)
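
# Usage sketch (run from an async context; the OpenAI client and model name
# below are illustrative, not a fixed dependency of this module):
#
#   from openai import OpenAI
#   result = await load_file_with_markitdown(
#       "report.pdf", llm_client=OpenAI(), model="gpt-4o"
#   )
#   print(result.title, result.text_content[:200])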



async def load_pdf_with_langchain(file_path: str):
    """
    Loads and extracts text from a PDF file using LangChain's PyPDFLoader.

    Parameters:
        file_path (str): Path to the PDF file.

    Returns:
        List[Document]: A list of LangChain Document objects (one per page).
    """
    # extract_images=True also runs OCR on embedded images, which requires the
    # optional rapidocr-onnxruntime package to be installed.
    loader = PyPDFLoader(file_path, extract_images=True)
    documents = loader.load()

    return documents

async def load_file_with_langchain(file_path: str):
    """
    Loads and extracts text from a file using the LangChain loader that
    matches its extension.

    Parameters:
        file_path (str): Path to the file (PDF, DOCX, CSV, or XLSX).

    Returns:
        List[Document]: A list of LangChain Document objects with metadata.
    """
    # Map each supported extension to its loader class
    loaders = {
        '.pdf': PyPDFLoader,
        '.docx': Docx2txtLoader,
        '.csv': CSVLoader,
        '.xlsx': UnstructuredExcelLoader,
    }

    # Determine the file extension and pick the matching loader
    _, file_extension = os.path.splitext(file_path)
    loader_cls = loaders.get(file_extension.lower())
    if loader_cls is None:
        raise ValueError(
            f"Unsupported file format '{file_extension}'. "
            "Please provide a PDF, DOCX, CSV, or XLSX file."
        )

    # Load the documents
    documents = loader_cls(file_path).load()

    return documents
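
# Usage sketch (file names are illustrative):
#
#   docs = await load_file_with_langchain("customers.csv")   # one Document per row
#   docs = await load_file_with_langchain("handbook.docx")   # whole file as a single Document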

async def split_documents(documents, chunk_size=10000, chunk_overlap=1000):
    """
    Splits documents into smaller chunks with overlap.

    Parameters:
        documents (List[Document]): List of LangChain Document objects.
        chunk_size (int): The maximum size of each chunk, in characters.
        chunk_overlap (int): The number of characters shared between consecutive chunks.

    Returns:
        List[Document]: List of chunked Document objects.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_docs = text_splitter.split_documents(documents)
    return split_docs
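
# Usage sketch (values are illustrative; RecursiveCharacterTextSplitter tries to
# break on paragraphs, then lines, then words, before falling back to characters):
#
#   docs = await load_file_with_langchain("handbook.docx")
#   chunks = await split_documents(docs, chunk_size=2000, chunk_overlap=200)
#   logger.info("Split %d documents into %d chunks", len(docs), len(chunks))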

async def process_uploaded_file(
    file_id, file_path,
    rag_system=None,
    llm_client=None,
    llm_model=None
):
    """
    Loads an uploaded file with MarkItDown and, if a RAG system is provided,
    indexes its text content under a doc_id derived from `file_id`.

    Parameters:
        file_id: Identifier of the uploaded file, used to build the doc_id.
        file_path (str): Path to the uploaded file on disk.
        rag_system: Optional RAG system exposing `add_document(doc_id, text)`.
        llm_client: Optional LLM client forwarded to MarkItDown.
        llm_model (str): Name of the model to use with the LLM client.

    Returns:
        DocumentConverterResult: The MarkItDown conversion result.
    """
    try:
        # Load the document using MarkItDown
        documents = await load_file_with_markitdown(file_path, llm_client=llm_client, model=llm_model)
        logger.info(f"Loaded document: {file_path}")
    except Exception as e:
        logger.error(f"Failed to load document {file_path}: {e}")
        raise RuntimeError(f"Error loading document: {file_path}") from e

    # Per-chunk context generation (disabled). If re-enabled, the file must
    # first be split with `split_documents` and an `llm` passed in:
    # whole_document_content = "\n".join([doc.page_content for doc in split_docs])
    # for doc in split_docs:
    #     try:
    #         context = await llm.generate_context(doc, whole_document_content=whole_document_content)
    #         # Prepend the generated context to the chunk's content
    #         doc.page_content = f"{context.replace('<|eot_id|>', '')}\n\n{doc.page_content}"
    #         logger.info(f"Context generated and added for chunk {split_docs.index(doc)}")
    #     except Exception as e:
    #         logger.error(f"Failed to generate context for chunk {split_docs.index(doc)}: {e}")
    #         raise RuntimeError(f"Error generating context for chunk {split_docs.index(doc)}") from e

    # Add to the RAG system if one is provided
    if rag_system:
        try:
            # `title` can be None for some formats; fall back to the file name
            title = documents.title or os.path.basename(file_path)
            doc_id = f"{file_id}_{title}"
            rag_system.add_document(doc_id=doc_id, text=documents.text_content)

            logger.debug(f"doc_id: {doc_id}")
            logger.debug(f"content: {documents.text_content}")
            logger.info(f"Document successfully added to RAG system for file {file_path}")
        except Exception as e:
            logger.error(f"Failed to add document to RAG system for {file_path}: {e}")
            raise RuntimeError(f"Error adding document to RAG system: {file_path}") from e
    else:
        logger.info(f"Loaded document {file_path}, but not added to RAG system")

    return documents
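

# Minimal end-to-end sketch. The file path and the in-memory `DemoRAG`
# stand-in below are illustrative only; they are not part of the app.
if __name__ == "__main__":
    import asyncio

    class DemoRAG:
        """Toy stand-in exposing the add_document() interface used above."""

        def __init__(self):
            self.store = {}

        def add_document(self, doc_id, text):
            self.store[doc_id] = text

    async def main():
        rag = DemoRAG()
        # Convert the file with MarkItDown and index its text in the demo store
        result = await process_uploaded_file(42, "sample.pdf", rag_system=rag)
        print(f"Indexed {len(rag.store)} document(s); title: {result.title}")

    asyncio.run(main())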