# FootballRAG — faiss_setup.py
# Builds a FAISS vectorstore from the Wikipedia pages of the players listed
# in artifacts/data.csv, for use by the RAG pipeline.
# importing required libraries
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import wikipedia
import pandas as pd
from tqdm import tqdm
# Load the player names from the CSV ("latin-1" copes with accented names in
# the data file). Forward slashes are portable; the previous
# "artifacts\data.csv" relied on "\d" not being a recognized escape sequence
# and only worked on Windows.
players = pd.read_csv("artifacts/data.csv", encoding="latin-1")["Name"].to_list()

# Fetch each player's Wikipedia page and assemble one corpus string.
# Parts are collected in a list and joined once at the end: repeated
# "content += ..." on a str is quadratic in the total corpus size.
parts = []
for player in tqdm(players, desc="Fetching Data : "):
    # auto_suggest=False: resolve the exact title so a near-miss suggestion
    # cannot silently swap in the wrong article.
    text = wikipedia.page(player, auto_suggest=False).content
    parts.append(player.upper() + text + "\n")
content = "".join(parts)
# Embedding function used to vectorise the text chunks (all-mpnet-base-v2 is
# a general-purpose sentence-transformer).
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# Chunker for the corpus: prefer sentence/paragraph boundaries, emit chunks
# of at most 750 characters with 125 characters of overlap between
# neighbours so context isn't lost at chunk edges.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[".", "\n"],
    chunk_size=750,
    chunk_overlap=125,
    length_function=len,
)
# Split the corpus into chunks and index them in a FAISS vectorstore.
documents = text_splitter.split_text(content)
vectorstore = FAISS.from_texts(documents, embeddings)
# Persist the index. Forward slashes work on every platform; the previous
# "artifacts\FAISS-Vectorstore" contained the invalid escape "\F" and only
# worked because CPython currently leaves unknown escapes as-is.
vectorstore.save_local("artifacts/FAISS-Vectorstore")