Spaces:

Adr740
/

AI4PE-STATIC

Runtime error

App Files Files Community

AI4PE-STATIC / app.py

Adr740

Update app.py

a68f50c verified 11 months ago

raw

history blame

10.5 kB

	import gradio as gr
	import os
	from functools import partial
	from config import file_id_htl_biotech, file_id_kamera_express, file_id_smart_sd, file_id_sunday_naturals
	import gdown
	import pandas as pd
	from hogwarts import get_answer
	from hogwats_gemini import get_answer as get_answer_gemini
	from evaluator import eval_answer
	import nltk
	nltk.download('punkt')
	os.system("apt update; yes \| apt-get install poppler-utils; yes \| ls")

	dico = {file_id_htl_biotech : {"name" : "htl-biotechnology", "data" : None},
	file_id_smart_sd : {"name" : "smart-sd", "data" : None},
	file_id_kamera_express : {"name" : "kamera-express", "data" : None},
	file_id_sunday_naturals : {"name" : "sunday-naturals", "data" : None}, }

	choices = ["htl-biotechnology",
	"smart-sd",
	"kamera-express",
	"sunday-naturals"]

	title = "AI4PE - Olivier and Adam \n contact: adamrida.ra@gmail.com or sp.olivier@hotmail.com"


	for file_id in dico:
	print("GOING FOR ", dico[file_id]["name"])
	download_url = f'https://drive.google.com/uc?id={file_id}'

	# Download the file using gdown
	output = 'downloaded_file.csv'
	gdown.download(download_url, output, quiet=False)

	# Read the CSV file into a DataFrame
	df = pd.read_csv(output, sep=";")[["content", "embeddings"]].replace("transcript_", "expert_meeting_notes_")
	dico[file_id]["data"] = df

	id_to_name_mapper = {
	file_id_htl_biotech : 'htl-biotechnology',
	file_id_smart_sd : 'smart-sd',
	file_id_kamera_express : 'kamera-express',
	file_id_sunday_naturals : 'sunday-naturals',
	}
	name_to_id_mapper = {
	'htl-biotechnology': file_id_htl_biotech,
	'smart-sd': file_id_smart_sd,
	'kamera-express': file_id_kamera_express,
	'sunday-naturals': file_id_sunday_naturals,
	}

	def get_list_files(company, dico=dico, name_to_id_mapper=name_to_id_mapper):
	pdfs = []
	web_pages = []
	transcript = []
	for ext in dico[name_to_id_mapper[company]]["data"].content.values:
	# break
	filename = ext.split("\n")[0]

	if "SOURCE: COMPANY WEBSITE" in ext:
	filename=filename.replace("https::", "").replace("https:", "").replace(".txt", "").replace(".com", " ").replace(".", " Page: ")
	web_pages.append(filename)
	if "SOURCE: PDF FILE" in ext:
	# nb_pdfs += 1
	filename = "SOURCE: UPLOADED PDF - " + ext.split("PATH_FILE =")[1].split("'}\"")[0].split("/pdfs/")[1].split("/png")[0]+".pdf"
	pdfs.append(filename)
	# break
	# ext
	pass
	if "SOURCE: NOTES FROM EXPERT CALL" in ext:
	# nb_expert_transcripts += 1
	filename = ext.replace("_1 copy", "").replace("transcript ", "Note #").replace("transcript_1", "Note #2").replace("transcript", "Note #1").replace(".txt", "").split("\n")[0]
	transcript.append(filename)
	pass
	# print(filename)
	pdfs_string = "## Uploaded PDF files: \n" + "\n\n".join(list(set(pdfs)))
	web_pages = "## Enriched from the web: \n" + "\n\n".join(list(set(web_pages)))
	transcript = "## Uploaded notes from expert calls: \n" + "\n\n".join(list(set(transcript)))
	return web_pages, pdfs_string, transcript

	def get_data_room_overview(company, dico = dico,name_to_id_mapper = name_to_id_mapper):
	nb_pdfs = 0
	nb_expert_transcripts = 0
	nb_web = 0
	for ext in dico[name_to_id_mapper[company]]["data"].content.values:
	if "SOURCE: COMPANY WEBSITE" in ext:
	nb_web += 1
	if "SOURCE: PDF FILE" in ext:
	nb_pdfs += 1
	if "SOURCE: NOTES FROM EXPERT CALL" in ext:
	nb_expert_transcripts += 1
	disp = f"""---
	### Overview of the data room
	Enriched data room with: Linkedin profile and company website

	Volumetry:
	- {nb_pdfs} passages from PDF files
	- {nb_web} passages from company website
	- {nb_expert_transcripts} passages from notes of expert calls
	"""

	sunday_naturals_web, sunday_naturals_pdfs, sunday_naturals_expert = get_list_files("sunday-naturals", dico, name_to_id_mapper)
	smart_sd_web, smart_sd_pdfs, smart_sd_expert, = get_list_files("smart-sd", dico, name_to_id_mapper)
	htl_biotech_web, htl_biotech_pdfs, htl_biotech_expert, = get_list_files("htl-biotechnology", dico, name_to_id_mapper)
	kamera_express_web, kamera_express_pdfs, kamera_express_expert =get_list_files("kamera-express", dico, name_to_id_mapper)

	return disp, sunday_naturals_web, sunday_naturals_pdfs,sunday_naturals_expert,smart_sd_web,smart_sd_pdfs,smart_sd_expert,htl_biotech_web,htl_biotech_pdfs,htl_biotech_expert,kamera_express_web,kamera_express_pdfs,kamera_express_expert


	def generate_chat_answer(company_name, query):

	df = dico[name_to_id_mapper[company_name]]["data"]
	response = get_answer(df, 15, query)
	print("=====> Evaluating answer quality...")
	eval_score = eval(eval_answer(query, response))
	eval_md = f"""
	### Evalation of how well the response answer the intial question

	Score of {eval_score["score"]}/5

	Rationale:

	{eval_score["rationale_based_on_scoring_rules"]}
	"""

	return response, eval_md

	def generate_chat_answer_gemini(company_name, query):

	df = dico[name_to_id_mapper[company_name]]["data"]
	content = df["content"].values
	response = get_answer_gemini(query, company_name, content)
	print("=====> Evaluating answer quality...")
	eval_score = eval(eval_answer(query, response))
	eval_md = f"""
	### Evalation of how well the response answer the intial question

	Score of {eval_score["score"]}/5

	Rationale:

	{eval_score["rationale_based_on_scoring_rules"]}
	"""

	return response, eval_md

	with gr.Blocks(title=title,theme='nota-ai/theme') as demo:
	gr.Markdown(f"## {title}")

	with gr.Row(equal_height=True):
	with gr.Column(scale=1):
	company_name = gr.Dropdown(choices=choices, label="Select company")

	submit_button = gr.Button(value="Load workspace")
	data_room_overview = gr.Markdown("---\n### Overview of the data room")
	with gr.Column(scale=6):
	with gr.Tab("Chat - Baseline"):
	with gr.Row():
	with gr.Column(scale=5):
	chat_input = gr.Textbox(placeholder="Chat input", lines=2, label="Retrieve anything from the dataroom")
	with gr.Column(scale=1):
	chat_submit_button = gr.Button(value="Submit")
	with gr.Accordion("Accuracy score", open=False):
	evaluator = gr.Markdown("Waiting for answer to evaluate...")
	chat_output = gr.Markdown("Waiting for question...")
	with gr.Tab("Chat - ICL", interactive=True):
	with gr.Row():
	with gr.Column(scale=5):
	chat_input_gemini = gr.Textbox(placeholder="Chat input", lines=2, label="Retrieve anything from the dataroom")
	with gr.Column(scale=1):
	chat_submit_button_gemini = gr.Button(value="Submit")
	with gr.Accordion("Accuracy score", open=False):
	evaluator_gemini = gr.Markdown("Waiting for answer to evaluate...")
	chat_output_gemini = gr.Markdown("Waiting for question...")
	with gr.Tab("Data", interactive = True):
	with gr.Tab("Sunday Naturals"):
	with gr.Row():
	with gr.Column():
	sunday_naturals_web = gr.Markdown("Sources obtained from website")
	with gr.Column():
	sunday_naturals_pdfs = gr.Markdown("Sources obtained from uploaded pdfs")
	# with gr.Column():
	sunday_naturals_expert = gr.Markdown("Sources obtained from expert call notes")
	pass
	with gr.Tab("Smart SD"):
	with gr.Row():
	with gr.Column():
	smart_sd_web = gr.Markdown("Sources obtained from website")
	with gr.Column():
	smart_sd_pdfs = gr.Markdown("Sources obtained from uploaded pdfs")
	# with gr.Column():
	smart_sd_expert = gr.Markdown("Sources obtained from expert call notes")
	pass
	with gr.Tab("HTL Biotech"):
	with gr.Row():
	with gr.Column():
	htl_biotech_web = gr.Markdown("Sources obtained from website")
	with gr.Column():
	htl_biotech_pdfs = gr.Markdown("Sources obtained from uploaded pdfs")
	# with gr.Column():
	htl_biotech_expert = gr.Markdown("Sources obtained from expert call notes")
	pass
	with gr.Tab("Kamera Express"):
	with gr.Row():
	with gr.Column():
	kamera_express_web = gr.Markdown("Sources obtained from website")
	with gr.Column():
	kamera_express_pdfs = gr.Markdown("Sources obtained from uploaded pdfs")
	# with gr.Column():
	kamera_express_expert = gr.Markdown("Sources obtained from expert call notes")
	pass
	with gr.Tab("Benchmark", interactive=False):
	pass

	fn = partial(get_data_room_overview)
	fn_chat = partial(generate_chat_answer)
	fn_chat_gemini = partial(generate_chat_answer_gemini)
	submit_button.click(fn=fn, inputs=[company_name], outputs=[
	data_room_overview,
	sunday_naturals_web,
	sunday_naturals_pdfs,
	sunday_naturals_expert,
	smart_sd_web,
	smart_sd_pdfs,
	smart_sd_expert,
	htl_biotech_web,
	htl_biotech_pdfs,
	htl_biotech_expert,
	kamera_express_web,
	kamera_express_pdfs,
	kamera_express_expert])


	chat_submit_button.click(fn=fn_chat, inputs=[company_name, chat_input], outputs=[chat_output, evaluator])
	chat_submit_button_gemini.click(fn=fn_chat_gemini, inputs=[company_name, chat_input_gemini], outputs=[chat_output_gemini, evaluator_gemini])
	login = os.environ.get("login")
	pwd = os.environ.get("pwd")

	demo.launch(max_threads=40, max_file_size="100mb",auth=(login, pwd))
	# demo.launch(max_threads=40, max_file_size="100mb")