Robby-chatbot / modules /embedder.py
ClearLove443
update
3cc0f37
raw
history blame
2.79 kB
import os
import pickle
import tempfile
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
class Embedder:
    """Build and cache FAISS vector stores for uploaded documents.

    The vector store of each document is pickled inside ``self.PATH`` under a
    name derived from the uploaded filename. The cache is keyed by filename
    only, so a different file uploaded under an already-cached name reuses the
    stored vectors.
    """

    # File extensions storeDocEmbeds knows how to load.
    SUPPORTED_EXTENSIONS = (".csv", ".pdf", ".txt")

    def __init__(self):
        self.PATH = "embeddings"
        self.createEmbeddingsDir()

    def createEmbeddingsDir(self):
        """
        Creates a directory to store the embeddings vectors
        """
        # exist_ok avoids the check-then-create race between two processes
        # (or reruns) that both find the directory missing.
        os.makedirs(self.PATH, exist_ok=True)

    def _pickle_path(self, original_filename):
        """Return the cache file path for ``original_filename`` in ``self.PATH``."""
        # The filename comes from an upload: keep only its last component so a
        # name such as "../x" cannot point outside the cache directory.
        safe_name = os.path.basename(original_filename)
        return os.path.join(self.PATH, f"{safe_name}.pkl")

    def _load_documents(self, tmp_file_path, file_extension):
        """Load the file at ``tmp_file_path`` into a list of documents.

        Raises:
            ValueError: if ``file_extension`` is not a supported type.
        """
        if file_extension == ".csv":
            loader = CSVLoader(
                file_path=tmp_file_path,
                encoding="utf-8",
                csv_args={
                    "delimiter": ",",
                },
            )
            # CSV content is used as loaded; only PDF and text go through the
            # text splitter below.
            return loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=100,
            length_function=len,
        )
        if file_extension == ".pdf":
            loader = PyPDFLoader(file_path=tmp_file_path)
        elif file_extension == ".txt":
            loader = TextLoader(file_path=tmp_file_path, encoding="utf-8")
        else:
            raise ValueError(f"Unsupported file type {file_extension!r}")
        return loader.load_and_split(text_splitter)

    def storeDocEmbeds(self, file, original_filename):
        """
        Stores document embeddings using Langchain and FAISS

        Args:
            file: raw bytes of the uploaded document.
            original_filename: name of the upload; its extension selects the
                loader and its base name determines the cache file name.

        Raises:
            ValueError: if the extension is not one of SUPPORTED_EXTENSIONS,
                or if no documents could be extracted from the file.
        """
        file_extension = os.path.splitext(original_filename)[1].lower()
        # Reject unknown types up front, before a temp file is written or the
        # embedding model is created.
        if file_extension not in self.SUPPORTED_EXTENSIONS:
            supported = ", ".join(self.SUPPORTED_EXTENSIONS)
            raise ValueError(
                f"Unsupported file type {file_extension!r} for "
                f"{original_filename!r}; expected one of: {supported}"
            )

        # The loaders read from a path, so the bytes go through a temp file.
        tmp_file = tempfile.NamedTemporaryFile(mode="wb", delete=False)
        tmp_file_path = tmp_file.name
        try:
            with tmp_file:
                tmp_file.write(file)

            data = self._load_documents(tmp_file_path, file_extension)
            if not data:
                # Fail with a clear message instead of handing FAISS nothing
                # to index.
                raise ValueError(
                    f"No content could be extracted from {original_filename!r}"
                )

            # Embeddings come from a HuggingFace model; the import stays inside
            # the method so it only runs when a document is actually embedded.
            from langchain.embeddings import HuggingFaceEmbeddings

            modelpath = "intfloat/e5-large-v2"
            embeddings = HuggingFaceEmbeddings(model_name=modelpath)
            vectors = FAISS.from_documents(data, embeddings)
        finally:
            # Always remove the temp file, including when loading or
            # embedding fails.
            os.remove(tmp_file_path)

        # Save the vectors to a pickle file. Write to a sibling file first and
        # move it into place, so a failed dump never leaves a truncated cache
        # entry that getDocEmbeds would later try to load.
        pickle_path = self._pickle_path(original_filename)
        partial_path = f"{pickle_path}.tmp"
        try:
            with open(partial_path, "wb") as f:
                pickle.dump(vectors, f)
        except Exception:
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
        os.replace(partial_path, pickle_path)

    def getDocEmbeds(self, file, original_filename):
        """
        Retrieves document embeddings

        Builds and caches the vectors first when no cache file exists for
        ``original_filename``; otherwise ``file`` is not used.

        Returns:
            The object unpickled from the cache file (the vector store saved
            by storeDocEmbeds).
        """
        pickle_path = self._pickle_path(original_filename)
        if not os.path.isfile(pickle_path):
            self.storeDocEmbeds(file, original_filename)

        # Load the vectors from the pickle file.
        # NOTE(security): unpickling executes code embedded in the file, so the
        # cache directory must only ever contain files written by this class.
        with open(pickle_path, "rb") as f:
            vectors = pickle.load(f)

        return vectors