# Scraped Hugging Face file-viewer header (Koshti10, commit "Upload 57 files",
# rev 3369d9f, 907 bytes) — kept as a comment so this file remains valid Python.
# DEPRECATED - Use keypoints.py, to get combined answer
import pandas as pd
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
# Build a combined text corpus from the first `response` value of every CSV in
# `folder`, split it into overlapping chunks, embed the chunks with OpenAI, and
# persist the resulting FAISS index to disk.
folder = 'paper_csvs'

# Accumulate pieces and join once at the end: repeated `result += ...` inside
# the loop is quadratic in total text size. Skip non-CSV entries so stray
# files (e.g. .DS_Store) don't crash pd.read_csv.
parts = []
for name in os.listdir(folder):
    if not name.lower().endswith('.csv'):
        continue
    path = os.path.join(folder, name)
    df = pd.read_csv(path)
    # Each CSV is expected to hold one report in its first `response` row.
    parts.append(str(df['response'].iloc[0]))
result = ''.join(parts)
print(len(result))
#21000 words - consultation reports
#12988 words - academic papers

# Split the combined text into ~1000-char chunks with 200-char overlap so
# neighboring chunks share context at their boundaries.
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 1000,
    chunk_overlap = 200,
    length_function = len,
)
texts = text_splitter.split_text(result)

# Create Embedding (requires OPENAI_API_KEY in the environment).
embedding = OpenAIEmbeddings()
db = FAISS.from_texts(texts, embedding)

# Save Embedding so keypoints code can reload it with FAISS.load_local.
db.save_local("paper_combined/faiss_index")