# NOTE(review): the lines "Spaces: / Running / Running" were Hugging Face
# Spaces UI paste residue, not code; kept here as a comment so the file parses.
# Standard library
import os
import time

# Third-party
import openai
from llama_index import Document, ServiceContext, SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.llms import OpenAI
from trulens_eval import Tru

# Local
import utils
from utils import get_prebuilt_trulens_recorder

# Configure the OpenAI client with the key resolved by the project helper.
openai.api_key = utils.get_openai_api_key()
def main():
    """Index the HI knowledge base PDF and run a TruLens-recorded evaluation.

    Builds a llama_index vector index over the PDF, answers every question in
    ``./raw_documents/eval_questions.txt`` through a query engine wrapped in a
    TruLens recorder, and writes the records plus timing info to ``./results``.

    The whole pipeline is skipped when ``./default.sqlite`` (the TruLens
    database) already exists, so the expensive work only happens once.
    """
    if not os.path.exists("./default.sqlite"):
        start_time = time.time()

        file_ls_str = ", ".join(os.listdir("./raw_documents"))
        print(f"File list: {file_ls_str}")
        print("")

        # Load the PDF and merge all pages into one Document so index
        # chunking is not constrained by page boundaries.
        documents = SimpleDirectoryReader(
            input_files=["./raw_documents/HI_Knowledge_Base.pdf"]
        ).load_data()
        document = Document(text="\n\n".join([doc.text for doc in documents]))

        ### gpt-4-1106-preview
        ### gpt-3.5-turbo-1106 / gpt-3.5-turbo
        print("Initializing GPT 3.5 ..")
        llm = OpenAI(model="gpt-3.5-turbo-1106", temperature=0.1)

        print("Initializing bge-small-en-v1.5 embedding model ..")
        embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

        print("Creating vector store ..")
        print("time spent:", time.time() - start_time)
        service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
        index = VectorStoreIndex.from_documents([document], service_context=service_context)
        query_engine = index.as_query_engine()

        # Evaluation questions are blank-line-separated blocks in the file.
        separator = "\n\n"
        eval_questions = []
        with open('./raw_documents/eval_questions.txt', 'r') as file:
            content = file.read()
            for question in content.split(separator):
                print(question)
                print(separator)
                eval_questions.append(question.strip())

        # Smoke-test the engine on the first question before recording.
        response = query_engine.query(eval_questions[0])
        print(str(response))

        tru = Tru()
        # tru.reset_database()
        tru_recorder = get_prebuilt_trulens_recorder(query_engine,
                                                     app_id="Direct Query Engine")

        print("Sending each question to llm ..")
        # Every query issued inside this context is captured by TruLens.
        with tru_recorder as recording:
            for question in eval_questions:
                response = query_engine.query(question)

        records, feedback = tru.get_records_and_feedback(app_ids=[])
        os.makedirs("./results", exist_ok=True)
        records.to_csv("./results/records.csv", index=False)
        print(tru.db.engine.url.render_as_string(hide_password=False))

        end_time = time.time()
        time_spent_mins = (end_time - start_time) / 60
        with open("./results/time_cost.txt", "w") as fp:
            fp.write(f"Takes {int(time_spent_mins)} mins to create llm evaluation.")

        # tru.run_dashboard()
# Script entry point: only run the pipeline when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()