# Spaces status: Runtime error (page text captured with the source; not program code)
import os
from uuid import uuid4

import chromadb
import gradio as gr
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter shared by every upload: chunk_size=800 with chunk_overlap=50
# (measured by the splitter's length function, characters by default).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=50,
)

# On-disk Chroma store rooted at the "test" directory.
client = chromadb.PersistentClient(path="test")

# The store persists between runs, so reuse "test_data" when it already
# exists; create_collection would raise on every start after the first.
collection = client.get_or_create_collection("test_data")
def upload_pdf(file_path):
    """Split a PDF into text chunks and store them in the ChromaDB collection.

    Every page returned by ``PyPDFLoader`` is split with the module-level
    ``text_splitter``. Each resulting chunk is stored as its own document,
    tagged with the metadata of the page it came from and a fresh UUID.

    Args:
        file_path: Path to the PDF (``str`` or ``os.PathLike``). A file-like
            object that exposes its path as ``.name`` is also accepted, since
            some Gradio versions pass a temp-file wrapper for file inputs.

    Returns:
        A status message. On success it includes the total number of chunks
        currently held by the collection.
    """
    # Normalise the input to a filesystem path; objects that are not
    # path-like fall back to their ``.name`` attribute.
    try:
        pdf_path = os.fspath(file_path)
    except TypeError:
        pdf_path = file_path.name

    pages = PyPDFLoader(pdf_path).load()

    # One entry per chunk (not per page), kept in lockstep with its metadata.
    texts = []
    metadatas = []
    for page in pages:
        for chunk in text_splitter.split_text(page.page_content):
            texts.append(chunk)
            metadatas.append(page.metadata)

    # Nothing to store (empty or image-only PDF): skip the add call, which
    # is not meant to be invoked with empty lists.
    if not texts:
        return "No text could be extracted from the PDF. Nothing was stored in ChromaDB"

    collection.add(
        ids=[str(uuid4()) for _ in texts],
        documents=texts,
        metadatas=metadatas,
    )
    return f"PDF Uploaded Successfully. {collection.count()} chunks stored in ChromaDB"
# Define the Gradio interface: a single file input wired to upload_pdf, with
# the returned status message shown in a textbox.
iface = gr.Interface(
    fn=upload_pdf,
    inputs=["file"],  # Specify a file input component
    outputs="textbox",  # Display the output text in a textbox
    title="Upload PDF to ChromaDB",
    description="Upload a PDF file and store its text chunks in ChromaDB.",
)

# Launch the Gradio app only when this file is executed as a script, so that
# importing the module does not start a server as a side effect.
if __name__ == "__main__":
    iface.launch(debug=True, share=True)