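"""Gradio app for cicero-qa-api.

The app retrieves matching article chunks from a local FAISS index ('db_full'),
asks an LLM (Claude by default) to extract themes from the best-matching
articles, and generates related questions from the second-tier matches.
The themes are returned together with their source references.
"""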
import gradio as gr
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import LLMChain
from langchain import PromptTemplate
import re
import pandas as pd
from langchain.vectorstores import FAISS
import requests
from typing import List
from langchain.schema import (
SystemMessage,
HumanMessage,
AIMessage
)
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
import ast
from utils import ClaudeLLM, extract_website_name, remove_numbers
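# ClaudeLLM is a custom LangChain LLM wrapper defined in utils.py; remove_numbers
# (also from utils.py) is used further down to strip list numbering from the generated questions.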
# Embedding model and FAISS vector store holding the article chunks
embeddings = HuggingFaceEmbeddings()
db = FAISS.load_local('db_full', embeddings)

# Cache of the most recently retrieved documents, reused when a query returns nothing
mp_docs = {}

llm = ClaudeLLM()
# ChatOpenAI(
#     temperature=0,
#     model='gpt-3.5-turbo-16k'
# )

def add_text(history, text):
    print(history)
    history = history + [(text, None)]
    return history, ""

# Candidate LLM backends; only the 'claude' entry is used by qa_retrieve below
pipeline = {'claude': (ClaudeLLM(), 0),
            'gpt-3.5': (ChatOpenAI(temperature=0, model='gpt-3.5-turbo-16k'), 65),
            'gpt-4': (ChatOpenAI(temperature=0, model='gpt-4'), 30)}
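# Note: constructing the ChatOpenAI entries above assumes OPENAI_API_KEY is set in the
# environment; ClaudeLLM is expected to handle its own Anthropic credentials (see utils.py).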

def retrieve_thoughts(query, n):
    """Score every stored chunk against the query and split the matches into two relevance tiers."""
    # print(db.similarity_search_with_score(query = query, k = k, fetch_k = k*10))
    docs_with_score = db.similarity_search_with_score(
        query=query,
        k=len(db.index_to_docstore_id.values()),
        fetch_k=len(db.index_to_docstore_id.values()),
    )

    # Flatten metadata, page content and FAISS score (treated as a distance: lower = more relevant)
    df = pd.DataFrame([dict(doc[0])['metadata'] for doc in docs_with_score])
    df = pd.concat((df, pd.DataFrame([dict(doc[0])['page_content'] for doc in docs_with_score], columns=['page_content'])), axis=1)
    df = pd.concat((df, pd.DataFrame([doc[1] for doc in docs_with_score], columns=['score'])), axis=1)

    # TO-DO: What if the user query doesn't match any of the stored documents?
    tier_1 = df[df['score'] < 0.7]
    tier_2 = df[(df['score'] < 0.95) & (df['score'] > 0.7)]

    # Re-assemble each article's chunks (ordered by chunk id) into a single content block
    chunks_1 = tier_1.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
    tier_1_adjusted = tier_1.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
    tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1)
    tier_1_adjusted['content'] = chunks_1

    chunks_2 = tier_2.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
    tier_2_adjusted = tier_2.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
    tier_2_adjusted['content'] = chunks_2

    if n:
        tier_1_adjusted = tier_1_adjusted[:min(len(tier_1_adjusted), n)]

    print(len(tier_1_adjusted))
    # tier_1 = [doc[0] for doc in docs if ((doc[1] < 1))][:5]
    # tier_2 = [doc[0] for doc in docs if ((doc[1] > 0.7)*(doc[1] < 1.5))][10:15]
    # Return the aggregated tier 2 frame: it carries the joined 'content' column used downstream
    return {'tier 1': tier_1_adjusted, 'tier 2': tier_2_adjusted[:5]}

def qa_retrieve(query, llm=None):
    # The llm argument is unused for now; the Claude backend from the pipeline is always selected
    llm = pipeline["claude"][0]

    docs = ""

    global db
    print(db)

    global mp_docs
    thoughts = retrieve_thoughts(query, 0)

    if not thoughts:
        # Nothing retrieved: fall back to the last cached retrieval, if any
        if mp_docs:
            thoughts = mp_docs
    else:
        mp_docs = thoughts

    tier_1 = thoughts['tier 1']
    tier_2 = thoughts['tier 2']

    reference = tier_1[['ref', 'url', 'title']].to_dict('records')

    tier_1 = list(tier_1.apply(lambda x: f"[{int(x['ref'])}] title: {x['title']}\n Content: {x.content}", axis=1).values)
    print(len(tier_1))
    tier_2 = list(tier_2.apply(lambda x: f"title: {x['title']}\n Content: {x.content}", axis=1).values)

    print(f"QUERY: {query}\nTIER 1: {tier_1}\nTIER 2: {tier_2}")
    # print(f"DOCS RETRIEVED: {mp_docs.values}")
    # Synthesis generation
    session_prompt = """ A bot that is open to discussions about different cultural, philosophical and political exchanges. You will do different analyses of the articles provided to you. Stay truthful, and if you weren't provided any resources, give your opinion only."""
    task = """Your primary responsibility is to identify multiple themes from the given articles. For each theme detected, you are to present it under three separate categories:
1. Theme Title - An easy-to-understand title that encapsulates the core idea of the theme extracted from the article.
2. Theme Description - An expanded elaboration that explores the theme in detail based on the arguments and points provided in the article.
3. Quotes related to theme - Locate and provide at least one compelling quote from the article that directly supports or showcases the theme you have identified. This quote should serve as specific evidence or an example from the article text that corresponds directly to the developed theme.
The extracted themes should be written in a structured manner, ensuring clarity and meaningful correlation between the themes and the articles. Make sure your analysis is rooted in the arguments given in the article. Avoid including personal opinions or making generalizations that are not explicitly supported by the articles. """
    prompt = PromptTemplate(
        input_variables=["query", "task", "session_prompt", "articles"],
        template="""
You are a {session_prompt}
{task}

query: {query}

Articles:
{articles}

The extracted themes should be written in a structured manner, ensuring clarity and meaningful correlation between the themes and the articles. Make sure your analysis is rooted in the arguments given in the article. Avoid including personal opinions or making generalizations that are not explicitly supported by the articles.
""",
    )

    # llm = BardLLM()
    chain = LLMChain(llm=llm, prompt=prompt)
    response = chain.run(query=query, articles="\n".join(tier_1), session_prompt=session_prompt, task=task)

    # Wrap every citation marker [1]..[N] in a styled span so the front end can highlight it
    for i in range(1, len(reference) + 1):
        response = response.replace(f'[{i}]', f"<span class='text-primary'>[{i}]</span>")
    # Generate related questions from the second-tier articles
    prompt_q = PromptTemplate(
        input_variables=["session_prompt", "articles"],
        template="""
You are a {session_prompt}
Give general/global questions related to the following articles:

Articles:
{articles}

Make sure not to ask specific questions; keep them general, short and concise.
""",
    )

    chain_q = LLMChain(llm=ClaudeLLM(), prompt=prompt_q)
    questions = chain_q.run(session_prompt=session_prompt, articles="\n".join(tier_2))
    print(questions)

    # Keep only the numbered list (everything from the first '1' onwards), then strip the numbering
    questions = questions[questions.index('1'):]
    questions = [remove_numbers(t).strip() for (i, t) in enumerate(questions.split('.')) if len(t) > 5][:5]
    print(questions)
    # TO-DO: initiate models in another function, refactor code to be reusable
    # json_resp = {'cynthesis': response, 'questions': questions, 'Reference': reference}
    return response, {'Reference': reference}

def flush():
    return None

examples = [
    ["Will Russia win the war in Ukraine?"],
]

demo = gr.Interface(fn=qa_retrieve, title="cicero-qa-api",
                    inputs=gr.inputs.Textbox(lines=5, label="what would you like to learn about?"),
                    outputs=[gr.components.Textbox(lines=3, label="Themes"),
                             gr.components.JSON(label="Reference")],
                    examples=examples)

demo.queue(concurrency_count=4)
demo.launch()