Spaces:
Paused
Paused
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, WebBaseLoader | |
from langchain_community.vectorstores import Qdrant | |
import os | |
def process_file(file_or_url):
    """Load a web page or an uploaded file into a list of documents.

    Args:
        file_or_url: Either a string starting with ``http://`` or
            ``https://``, or an uploaded-file object exposing ``path`` (a
            string) and ``content`` (written to disk in binary mode).

    Returns:
        The list of documents produced by the matching loader:
        ``WebBaseLoader`` for URLs, ``PyMuPDFLoader`` when ``path`` ends
        with ``.pdf``, and ``TextLoader`` for everything else.
    """
    # Initialise up front so every branch can append to it; assigning it only
    # after the URL branch made that branch fail with UnboundLocalError.
    documents = []

    if isinstance(file_or_url, str) and file_or_url.startswith(("http://", "https://")):
        # A URL has no ``path``/``content`` to persist, so return here rather
        # than falling through to the file-only code below.
        documents.extend(WebBaseLoader(file_or_url).load())
        return documents

    # Save the upload to disk: the file loaders are given a path to read.
    # NOTE(review): assumes ``path`` is relative to the working directory and
    # its parent directory exists - confirm against the caller.
    temp_file = "./" + file_or_url.path
    with open(temp_file, "wb") as file:
        file.write(file_or_url.content)

    if file_or_url.path.endswith(".pdf"):
        loader = PyMuPDFLoader(temp_file)
    else:
        loader = TextLoader(temp_file)
    documents.extend(loader.load())
    return documents
def add_to_qdrant(documents, embeddings, qdrant_client, collection_name):
    """Send documents to a Qdrant collection via ``Qdrant.from_documents``.

    Args:
        documents: Documents forwarded as the first positional argument.
        embeddings: Embeddings object forwarded as the second positional
            argument.
        qdrant_client: Object whose ``url`` attribute is passed as the
            ``url`` keyword.
        collection_name: Name passed as the ``collection_name`` keyword.

    Returns:
        None. The value returned by ``Qdrant.from_documents`` is discarded.
    """
    build_store = Qdrant.from_documents
    store_options = {
        "url": qdrant_client.url,
        "prefer_grpc": True,
        "collection_name": collection_name,
    }
    build_store(documents, embeddings, **store_options)