|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
from langchain.schema import Document |
|
from langchain_community.vectorstores import Chroma |
|
from langchain_openai import OpenAIEmbeddings |
|
import pandas as pd |
|
|
|
def load_csv_data(
    file_path: str,
    *,
    content_column: str = "content",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> list[Document]:
    """Load a CSV file and split one of its text columns into Document chunks.

    The non-missing cells of ``content_column`` are joined into a single text
    (one row per line), which is then split with
    ``RecursiveCharacterTextSplitter``.

    Args:
        file_path: Path of the CSV file, passed to ``pandas.read_csv``.
        content_column: Name of the column holding the text to index.
        chunk_size: ``chunk_size`` handed to the text splitter.
        chunk_overlap: ``chunk_overlap`` handed to the text splitter.

    Returns:
        One ``Document`` per chunk produced by the splitter, in order.

    Raises:
        KeyError: If the CSV has no column named ``content_column``.
    """
    df = pd.read_csv(file_path)
    if content_column not in df.columns:
        raise KeyError(
            f"CSV file {file_path!r} has no {content_column!r} column"
        )

    # Empty cells are read as NaN (a float) and cannot be concatenated with
    # str, so drop them and coerce whatever is left to text.
    row_texts = df[content_column].dropna().astype(str)

    # Keep a newline between rows: without a separator the end of one row
    # fuses with the start of the next and the splitter has no natural
    # boundary between records.
    combined_text = "\n".join(row_texts)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return [
        Document(page_content=chunk)
        for chunk in text_splitter.split_text(combined_text)
    ]
|
|
|
def create_vector_db(
    docs: list[Document],
    persist_directory: str = "vector_db",
) -> Chroma:
    """Build a Chroma vector store from ``docs``.

    Args:
        docs: Documents to add to the store.
        persist_directory: Directory passed to Chroma as its
            ``persist_directory``; defaults to ``"vector_db"``.

    Returns:
        The store returned by ``Chroma.from_documents``.
    """
    # NOTE(review): OpenAIEmbeddings() is built with no arguments, so it
    # presumably picks up its credentials from the environment - confirm.
    return Chroma.from_documents(
        docs,
        embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory,
    )
|
|
|
if __name__ == "__main__":
    # Script entry point: chunk the CSV export first ...
    docs = load_csv_data("doc_ai/pdf_data.csv")
    # ... then build the vector store from the resulting chunks.
    vector_db = create_vector_db(docs)
|
|