# importing required libraries
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import wikipedia
import pandas as pd
from tqdm import tqdm

# reading the names of the players from the data (a CSV with a "Name" column) and displaying a few of them
players = pd.read_csv("artifacts/data.csv", encoding = "latin-1")["Name"].to_list()
print(players[:5])

# extracting information about the players from their wikipedia pages
content = ""
for player in tqdm(players, desc = "Fetching Data"):
    try:
        text = wikipedia.page(player, auto_suggest = False).content
    except (wikipedia.DisambiguationError, wikipedia.PageError):
        continue  # skip players whose pages cannot be resolved unambiguously
    content += player.upper() + "\n" + text + "\n"

# configuring the embedding function for the text chunks
model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name = model_name)
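
# optional sanity check: all-mpnet-base-v2 produces 768-dimensional embeddings
assert len(embeddings.embed_query("sanity check")) == 768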

# splitting the text into text chunks 
text_splitter = RecursiveCharacterTextSplitter(
    separators = [".", "\n"],
    chunk_size = 750,
    chunk_overlap = 125,
    length_function = len
)
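# the splitter tries "." first and falls back to "\n", then merges the pieces
# into chunks of at most 750 characters with a 125-character overlap, so that
# context at a chunk boundary is shared by both neighbouring chunks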

# storing the text chunks into the vectorstore
chunks = text_splitter.split_text(content)
vectorstore = FAISS.from_texts(chunks, embeddings)

# saving the FAISS vectorstore
vectorstore.save_local("artifacts/FAISS-Vectorstore")
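
# a minimal usage sketch (not part of the pipeline above): reload the saved
# index and run a similarity search; the query string is illustrative only.
# note: newer langchain releases also require allow_dangerous_deserialization=True
# when calling load_local on a pickled FAISS index
loaded_vectorstore = FAISS.load_local("artifacts/FAISS-Vectorstore", embeddings)
results = loaded_vectorstore.similarity_search("Which clubs has the player played for?", k = 3)
for doc in results:
    print(doc.page_content[:200])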