# PDFQueryBot / upload_pdf.py
# aakash0563's picture
# Update upload_pdf.py
# 9063322
import chromadb
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from uuid import uuid4
import gradio as gr
# Chunking policy shared by every upload: chunks of at most 800 characters,
# with a 50-character overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=50,
)

# Chroma store persisted on disk under the "test" directory.
client = chromadb.PersistentClient(path="test")

# The store survives between runs, so after the first launch the collection
# already exists; create_collection() would raise in that case, which made the
# script fail on every start but the first. Reuse it when present.
collection = client.get_or_create_collection("test_data")
def upload_pdf(file_path):
    """Split a PDF into text chunks and store them in the Chroma collection.

    Args:
        file_path: Path to the PDF file, or an object exposing that path as
            ``.name`` (e.g. the temp-file wrapper that some Gradio versions
            pass for a "file" input). ``None`` means nothing was uploaded.

    Returns:
        A status message suitable for display in the UI.
    """
    import os  # local import: only needed to recognise path-like inputs

    if file_path is None:
        return "No PDF provided. Please upload a file."

    # Plain strings and path objects are used as-is; anything else is assumed
    # to be a file wrapper carrying the on-disk path in ``.name``.
    if isinstance(file_path, (str, os.PathLike)):
        pdf_path = file_path
    else:
        pdf_path = getattr(file_path, "name", file_path)

    pages = PyPDFLoader(pdf_path).load()

    # Keep texts and metadatas as parallel lists so that entry i of one always
    # describes entry i of the other. Each chunk is stored individually (the
    # previous code stored the page's first chunk once per chunk).
    texts = []
    metadatas = []
    for page in pages:
        for chunk in text_splitter.split_text(page.page_content):
            texts.append(chunk)
            metadatas.append(page.metadata)

    # A scanned/image-only or empty PDF yields no text; Chroma is expected to
    # reject an add() with empty lists, so report it instead of failing.
    if not texts:
        return "No extractable text found in the PDF. Nothing was stored."

    collection.add(
        ids=[str(uuid4()) for _ in texts],
        documents=texts,
        metadatas=metadatas,
    )
    # collection.count() is the total held by the collection, which includes
    # chunks from earlier uploads.
    return f"PDF Uploaded Successfully. {collection.count()} chunks stored in ChromaDB"
# Gradio front end: a single file-upload input wired to upload_pdf, with the
# returned status string rendered in a textbox.
iface = gr.Interface(
    upload_pdf,
    ["file"],
    "textbox",
    title="Upload PDF to ChromaDB",
    description="Upload a PDF file and store its text chunks in ChromaDB.",
)

# Start the app with debug mode on and a share link requested.
iface.launch(share=True, debug=True)