# app/crud.py
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, CSVLoader, UnstructuredExcelLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sqlalchemy.future import select
from sqlalchemy.ext.asyncio import AsyncSession
from markitdown import MarkItDown
import os
import logging
from typing import List, Optional

# from app.db.models.docs import *
# from app.schemas.schemas import DocumentCreate, DocumentUpdate

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

async def load_file_with_markitdown(file_path: str, llm_client=None, model: Optional[str] = None):
    """
    Converts a file to Markdown using MarkItDown, optionally with an LLM client
    for image descriptions. Returns a MarkItDown conversion result
    (exposes .title and .text_content).
    """
    if llm_client and model:
        # Pass the client and model by keyword to match MarkItDown's constructor.
        markitdown = MarkItDown(llm_client=llm_client, llm_model=model)
    else:
        markitdown = MarkItDown()
    documents = markitdown.convert(file_path)
    return documents

async def load_pdf_with_langchain(file_path):
    """
    Loads and extracts text from a PDF file using LangChain's PyPDFLoader.
    Parameters:
        file_path (str): Path to the PDF file.
    Returns:
        List[Document]: A list of LangChain Document objects with metadata.
    """
    loader = PyPDFLoader(file_path, extract_images=True)
    documents = loader.load()
    return documents  # Returns a list of Document objects

async def load_file_with_langchain(file_path: str):
    """
    Loads and extracts text from a file using the appropriate LangChain loader.
    Supported formats: PDF, DOCX, CSV, and XLSX.
    Parameters:
        file_path (str): Path to the file.
    Returns:
        List[Document]: A list of LangChain Document objects with metadata.
    """
    # Determine the file extension
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    # Choose the loader based on the file extension
    if file_extension == '.pdf':
        loader = PyPDFLoader(file_path)
    elif file_extension == '.docx':
        loader = Docx2txtLoader(file_path)
    elif file_extension == '.csv':
        loader = CSVLoader(file_path)
    elif file_extension == '.xlsx':
        loader = UnstructuredExcelLoader(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a PDF, DOCX, CSV, or XLSX file.")

    # Load the documents
    documents = loader.load()
    return documents

async def split_documents(documents, chunk_size=10000, chunk_overlap=1000):
    """
    Splits documents into smaller chunks with overlap.
    Parameters:
        documents (List[Document]): List of LangChain Document objects.
        chunk_size (int): The maximum size of each chunk.
        chunk_overlap (int): The number of characters to overlap between chunks.
    Returns:
        List[Document]: List of chunked Document objects.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_docs = text_splitter.split_documents(documents)
    return split_docs
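
# Hypothetical usage sketch (nothing in this module calls it): load a file with
# LangChain and chunk it for indexing. The path below is a placeholder, not a
# file shipped with this project.
#
#     docs = await load_file_with_langchain("data/example.docx")
#     chunks = await split_documents(docs, chunk_size=10000, chunk_overlap=1000)
#     logger.info(f"Produced {len(chunks)} chunks")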

async def process_uploaded_file(
    id, file_path,
    rag_system=None,
    llm_client=None,
    llm_model=None
):
    try:
        # Convert the document to Markdown using MarkItDown
        documents = await load_file_with_markitdown(file_path, llm_client=llm_client, model=llm_model)
        logger.info(f"Loaded document: {file_path}")
        # Concatenate all pages to get the full document text for context generation
        # whole_document_content = "\n".join([doc.page_content for doc in documents])
    except Exception as e:
        logger.error(f"Failed to load document {file_path}: {e}")
        raise RuntimeError(f"Error loading document: {file_path}") from e

    # # Generate context for each chunk if llm is provided
    # if llm:
    #     for doc in split_docs:
    #         try:
    #             context = await llm.generate_context(doc, whole_document_content=whole_document_content)
    #             # Add context to the beginning of the page content
    #             doc.page_content = f"{context.replace('<|eot_id|>', '')}\n\n{doc.page_content}"
    #             logger.info(f"Context generated and added for chunk {split_docs.index(doc)}")
    #         except Exception as e:
    #             logger.error(f"Failed to generate context for chunk {split_docs.index(doc)}: {e}")
    #             raise RuntimeError(f"Error generating context for chunk {split_docs.index(doc)}") from e

    # Add to the RAG system if one is provided
    if rag_system:
        try:
            rag_system.add_document(doc_id=f"{id}_{documents.title}", text=documents.text_content)
            logger.debug(f"doc_id: {id}_{documents.title}")
            logger.debug(f"content: {documents.text_content}")
            logger.info(f"Document successfully added to RAG system for file {file_path}")
        except Exception as e:
            logger.error(f"Failed to add document to RAG system for {file_path}: {e}")
            raise RuntimeError(f"Error adding document to RAG system: {file_path}") from e
    else:
        logger.info(f"Loaded document {file_path}, but not added to RAG system")

    return documents
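

# Minimal local sketch of how these helpers might be wired together. The file
# path and document id below are placeholders for illustration; in the app this
# function is driven by the upload endpoint, not run as a script.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        # "sample.pdf" is a placeholder; point it at any local file MarkItDown supports.
        result = await process_uploaded_file(id=1, file_path="sample.pdf")
        print(result.title)
        print(result.text_content[:500])

    asyncio.run(_demo())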