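"""Gradio app for cicero-qa-api.

The app retrieves matching article chunks from a local FAISS index ('db_full'),
asks an LLM (Claude by default) to extract themes from the best-matching
articles, and generates related questions from the second-tier matches.
The themes are returned together with their source references.
"""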
import gradio as gr
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import LLMChain
from langchain import PromptTemplate
import re
import pandas as pd
from langchain.vectorstores import FAISS
import requests
from typing import List
from langchain.schema import (
SystemMessage,
HumanMessage,
AIMessage
)
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
import ast
from utils import ClaudeLLM, extract_website_name, remove_numbers
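# ClaudeLLM is a custom LangChain LLM wrapper defined in utils.py; remove_numbers
# (also from utils.py) is used further down to strip list numbering from the generated questions.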
# Embedding model and FAISS vector store holding the article chunks
embeddings = HuggingFaceEmbeddings()
db = FAISS.load_local('db_full', embeddings)

# Cache of the most recently retrieved documents, reused when a query returns nothing
mp_docs = {}

llm = ClaudeLLM()
# ChatOpenAI(
#     temperature=0,
#     model='gpt-3.5-turbo-16k'
# )

def add_text(history, text):
    print(history)
    history = history + [(text, None)]
    return history, ""

# Candidate LLM backends; only the 'claude' entry is used by qa_retrieve below
pipeline = {'claude': (ClaudeLLM(), 0),
            'gpt-3.5': (ChatOpenAI(temperature=0, model='gpt-3.5-turbo-16k'), 65),
            'gpt-4': (ChatOpenAI(temperature=0, model='gpt-4'), 30)}
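# Note: constructing the ChatOpenAI entries above assumes OPENAI_API_KEY is set in the
# environment; ClaudeLLM is expected to handle its own Anthropic credentials (see utils.py).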

def retrieve_thoughts(query, n):
    """Score every stored chunk against the query and split the matches into two relevance tiers."""
    # print(db.similarity_search_with_score(query = query, k = k, fetch_k = k*10))
    docs_with_score = db.similarity_search_with_score(
        query=query,
        k=len(db.index_to_docstore_id.values()),
        fetch_k=len(db.index_to_docstore_id.values()),
    )

    # Flatten metadata, page content and FAISS score (treated as a distance: lower = more relevant)
    df = pd.DataFrame([dict(doc[0])['metadata'] for doc in docs_with_score])
    df = pd.concat((df, pd.DataFrame([dict(doc[0])['page_content'] for doc in docs_with_score], columns=['page_content'])), axis=1)
    df = pd.concat((df, pd.DataFrame([doc[1] for doc in docs_with_score], columns=['score'])), axis=1)

    # TO-DO: What if the user query doesn't match any of the stored documents?
    tier_1 = df[df['score'] < 0.7]
    tier_2 = df[(df['score'] < 0.95) & (df['score'] > 0.7)]

    # Re-assemble each article's chunks (ordered by chunk id) into a single content block
    chunks_1 = tier_1.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
    tier_1_adjusted = tier_1.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
    tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1)
    tier_1_adjusted['content'] = chunks_1

    chunks_2 = tier_2.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
    tier_2_adjusted = tier_2.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
    tier_2_adjusted['content'] = chunks_2

    if n:
        tier_1_adjusted = tier_1_adjusted[:min(len(tier_1_adjusted), n)]

    print(len(tier_1_adjusted))
    # tier_1 = [doc[0] for doc in docs if ((doc[1] < 1))][:5]
    # tier_2 = [doc[0] for doc in docs if ((doc[1] > 0.7)*(doc[1] < 1.5))][10:15]
    # Return the aggregated tier 2 frame: it carries the joined 'content' column used downstream
    return {'tier 1': tier_1_adjusted, 'tier 2': tier_2_adjusted[:5]}

def qa_retrieve(query, llm=None):
    # The llm argument is unused for now; the Claude backend from the pipeline is always selected
    llm = pipeline["claude"][0]

    docs = ""

    global db
    print(db)

    global mp_docs
    thoughts = retrieve_thoughts(query, 0)

    if not thoughts:
        # Nothing retrieved: fall back to the last cached retrieval, if any
        if mp_docs:
            thoughts = mp_docs
    else:
        mp_docs = thoughts

    tier_1 = thoughts['tier 1']
    tier_2 = thoughts['tier 2']

    reference = tier_1[['ref', 'url', 'title']].to_dict('records')

    tier_1 = list(tier_1.apply(lambda x: f"[{int(x['ref'])}] title: {x['title']}\n Content: {x.content}", axis=1).values)
    print(len(tier_1))
    tier_2 = list(tier_2.apply(lambda x: f"title: {x['title']}\n Content: {x.content}", axis=1).values)

    print(f"QUERY: {query}\nTIER 1: {tier_1}\nTIER 2: {tier_2}")
    # print(f"DOCS RETRIEVED: {mp_docs.values}")
    # Synthesis generation
    session_prompt = """ A bot that is open to discussions about different cultural, philosophical and political exchanges. You will do different analyses of the articles provided to you. Stay truthful, and if you weren't provided any resources, give your opinion only."""
    task = """Your primary responsibility is to identify multiple themes from the given articles. For each theme detected, you are to present it under three separate categories:
1. Theme Title - An easy-to-understand title that encapsulates the core idea of the theme extracted from the article.
2. Theme Description - An expanded elaboration that explores the theme in detail based on the arguments and points provided in the article.
3. Quotes related to theme - Locate and provide at least one compelling quote from the article that directly supports or showcases the theme you have identified. This quote should serve as specific evidence or an example from the article text that corresponds directly to the developed theme.
The extracted themes should be written in a structured manner, ensuring clarity and meaningful correlation between the themes and the articles. Make sure your analysis is rooted in the arguments given in the article. Avoid including personal opinions or making generalizations that are not explicitly supported by the articles. """
    prompt = PromptTemplate(
        input_variables=["query", "task", "session_prompt", "articles"],
        template="""
You are a {session_prompt}
{task}

query: {query}

Articles:
{articles}

The extracted themes should be written in a structured manner, ensuring clarity and meaningful correlation between the themes and the articles. Make sure your analysis is rooted in the arguments given in the article. Avoid including personal opinions or making generalizations that are not explicitly supported by the articles.
""",
    )

    # llm = BardLLM()
    chain = LLMChain(llm=llm, prompt=prompt)
    response = chain.run(query=query, articles="\n".join(tier_1), session_prompt=session_prompt, task=task)

    # Wrap every citation marker [1]..[N] in a styled span so the front end can highlight it
    for i in range(1, len(reference) + 1):
        response = response.replace(f'[{i}]', f"<span class='text-primary'>[{i}]</span>")
    # Generate related questions from the second-tier articles
    prompt_q = PromptTemplate(
        input_variables=["session_prompt", "articles"],
        template="""
You are a {session_prompt}
Give general/global questions related to the following articles:

Articles:
{articles}

Make sure not to ask specific questions; keep them general, short and concise.
""",
    )

    chain_q = LLMChain(llm=ClaudeLLM(), prompt=prompt_q)
    questions = chain_q.run(session_prompt=session_prompt, articles="\n".join(tier_2))
    print(questions)

    # Keep only the numbered list (everything from the first '1' onwards), then strip the numbering
    questions = questions[questions.index('1'):]
    questions = [remove_numbers(t).strip() for (i, t) in enumerate(questions.split('.')) if len(t) > 5][:5]
    print(questions)
    # TO-DO: initiate models in another function, refactor code to be reusable
    # json_resp = {'cynthesis': response, 'questions': questions, 'Reference': reference}
    return response, {'Reference': reference}

def flush():
    return None

examples = [
    ["Will Russia win the war in Ukraine?"],
]

demo = gr.Interface(fn=qa_retrieve, title="cicero-qa-api",
                    inputs=gr.inputs.Textbox(lines=5, label="what would you like to learn about?"),
                    outputs=[gr.components.Textbox(lines=3, label="Themes"),
                             gr.components.JSON(label="Reference")],
                    examples=examples)

demo.queue(concurrency_count=4)
demo.launch()