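# NOTE(review): "Spaces: Sleeping" looked like page-header residue captured with
# this file rather than part of the script — confirm and drop if so.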
# %%
# Load variables from a local .env file (python-dotenv) so that later cells can
# read them from the process environment.
from dotenv import load_dotenv

load_dotenv()
# Disabled data-preparation cells, kept as an inert string literal so they are
# not executed. They write ./data/corpus.parquet and ./data/qa.parquet, the two
# files the evaluation cell below reads. Remove the surrounding triple quotes to
# regenerate those files.
"""
# %%
import pandas as pd

df = pd.read_parquet(
    "../raw_data/dale_carnegie/how_to_win_friends_and_influence_people.parquet"
)
df.head()
# %%
from langchain.schema import Document

documents = []
for index, row in df.iterrows():
    doc = Document(page_content=row["text"])
    documents.append(doc)
documents
# %%
from autorag.utils import cast_corpus_dataset
from autorag.data.corpus import langchain_documents_to_parquet

corpus_df = langchain_documents_to_parquet(documents)
corpus_df = cast_corpus_dataset(corpus_df)
corpus_df.to_parquet("./data/corpus.parquet")
# %%
import nest_asyncio

nest_asyncio.apply()
import os

from llama_index.llms.openai import OpenAI
from autorag.data.qacreation import generate_qa_llama_index, make_single_content_qa

llm = OpenAI(
    api_base=os.getenv("OPENAI_BASE_URL"),
    model="gpt-4o",
)
qa_df = make_single_content_qa(
    corpus_df,
    content_size=49,
    qa_creation_func=generate_qa_llama_index,
    llm=llm,
    question_num_per_content=1,
)
qa_df.to_parquet("./data/qa.parquet")
"""
# %%
# Benchmark cell: register the candidate Hugging Face embedding models with
# AutoRAG, then run one evaluation trial over the prepared QA/corpus datasets.
import nest_asyncio

# Kept ahead of the AutoRAG imports, matching the original statement order.
# NOTE(review): presumably needed because this runs inside an already-running
# event loop (interactive cell runner) — confirm.
nest_asyncio.apply()

import autorag as ag
from autorag.evaluator import Evaluator
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# AutoRAG registry key -> Hugging Face model id.
# NOTE(review): the keys are presumably what the trial YAML refers to — verify
# against ./config/config_small.yaml before renaming any of them.
HF_EMBEDDING_MODELS = {
    "huggingface_baai_llm_embedder": "BAAI/llm-embedder",
    "huggingface_baai_bge_large_en": "BAAI/bge-large-en-v1.5",
    "huggingface_baai_bge_base_en": "BAAI/bge-base-en-v1.5",
    "huggingface_baai_bge_small_en": "BAAI/bge-small-en-v1.5",
    "huggingface_baai_bge_m3": "BAAI/bge-m3",
}

# Instantiate and register each model, in the same order as the table above.
for registry_key, model_id in HF_EMBEDDING_MODELS.items():
    ag.embedding_models[registry_key] = HuggingFaceEmbedding(model_id)

evaluator = Evaluator(
    qa_data_path="./data/qa.parquet",
    corpus_data_path="./data/corpus.parquet",
    project_dir="./benchmark",
)
evaluator.start_trial("./config/config_small.yaml")
# %%