# Builds a FAISS vectorstore from the Wikipedia pages of the players listed
# in artifacts/data.csv (the "Spaces: Sleeping" lines here were Hugging Face
# Spaces page residue from extraction, not part of the script).
# Build a FAISS vectorstore from the Wikipedia pages of the players listed
# in artifacts/data.csv, for later retrieval / similarity search.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import wikipedia
import pandas as pd
from tqdm import tqdm

# Forward slashes are portable; the original "artifacts\data.csv" relied on
# "\d" / "\F", which are invalid escape sequences in a plain string literal
# (DeprecationWarning today, SyntaxError in future CPython) and Windows-only.
DATA_PATH = "artifacts/data.csv"
VECTORSTORE_PATH = "artifacts/FAISS-Vectorstore"

# Player names to look up on Wikipedia (the CSV is latin-1 encoded).
players = pd.read_csv(DATA_PATH, encoding="latin-1")["Name"].to_list()

# Fetch each player's Wikipedia page. Pieces are collected in a list and
# joined once at the end -- repeated "content +=" is quadratic in total size.
sections = []
for player in tqdm(players, desc="Fetching Data : "):
    try:
        # auto_suggest=False: use the exact name; don't let the wikipedia
        # package silently "correct" it to a different article.
        text = wikipedia.page(player, auto_suggest=False).content
    except wikipedia.exceptions.WikipediaException as err:
        # A missing or ambiguous page shouldn't abort the whole batch run;
        # report it and move on to the next player.
        print(f"Skipping {player!r}: {err}")
        continue
    # Newline after the name header so it isn't fused to the article's
    # first sentence (original did player.upper() + text with no separator).
    sections.append(player.upper() + "\n" + text + "\n")
content = "".join(sections)

# Embedding function applied to the text chunks.
model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

# Split the corpus into overlapping chunks sized for the embedding model.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[".", "\n"],
    chunk_size=750,
    chunk_overlap=125,
    length_function=len,
)

# Embed the chunks, build the FAISS index, and persist it to disk.
documents = text_splitter.split_text(content)
vectorstore = FAISS.from_texts(documents, embeddings)
vectorstore.save_local(VECTORSTORE_PATH)