import gradio as gr
import numpy as np
import re
import os
import ast
import requests
import pandas as pd
from typing import Optional, List, Mapping, Any

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import LLMChain
from langchain import PromptTemplate
from langchain.vectorstores import FAISS
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage,
)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.llms.base import LLM

from utils import ClaudeLLM, extract_website_name, remove_numbers
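
# Load the embedding model (HuggingFaceEmbeddings defaults to
# sentence-transformers/all-mpnet-base-v2) and the pre-built FAISS index.
# Assumes 'db_full' is a local index directory previously written with
# FAISS.save_local('db_full'); loading fails if it is missing.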
embeddings = HuggingFaceEmbeddings()
db = FAISS.load_local('db_full', embeddings)

# Cache of the most recent non-empty retrieval (see qa_retrieve).
mp_docs = {}

# llm = ClaudeLLM()
# ChatOpenAI(
#     temperature=0,
#     model='gpt-3.5-turbo-16k'
# )


def add_text(history, text):
    """Append a user message to a chat history (unused by this interface)."""
    print(history)
    history = history + [(text, None)]
    return history, ""

# pipeline = {
#     'claude': (ClaudeLLM(), 0),
#     'gpt-3.5': (ChatOpenAI(temperature=0, model='gpt-3.5-turbo-16k'), 65),
#     'gpt-4': (ChatOpenAI(temperature=0, model='gpt-4'), 30),
# }


def retrieve_thoughts(query, n):
    """Search the FAISS index for `query` and group matching chunks
    by source article.

    Returns {'tier 1': DataFrame} with one row per source
    (ref, title, url, score, concatenated content).
    """
    # Score every chunk in the index against the query.
    k = len(db.index_to_docstore_id)
    docs_with_score = db.similarity_search_with_score(query=query, k=k, fetch_k=k)

    # Flatten the (Document, score) pairs into one DataFrame of
    # metadata, page content, and distance score.
    df = pd.DataFrame([dict(doc[0])['metadata'] for doc in docs_with_score])
    df['page_content'] = [dict(doc[0])['page_content'] for doc in docs_with_score]
    df['score'] = [doc[1] for doc in docs_with_score]

    # TO-DO: what if the user query matches none of our documents?
    # FAISS returns distances, so lower is better; keep close matches,
    # capped at 150 chunks.
    tier_1 = df[df['score'] < 0.95]
    tier_1 = tier_1[:min(len(tier_1), 150)]
    # tier_2 = df[(df['score'] < 0.95) & (df['score'] > 0.7)]

    # Stitch each source's chunks back together in chunk-id order.
    chunks_1 = tier_1.groupby(['title', 'url']).apply(
        lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)
    ).values
    tier_1_adjusted = tier_1.groupby(['title', 'url']).first().reset_index()[['title', 'url', 'score']]
    tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1)
    tier_1_adjusted['content'] = chunks_1

    # Truncate to the top-n sources; n == 0 keeps everything.
    if n:
        tier_1_adjusted = tier_1_adjusted[:min(len(tier_1_adjusted), n)]
    print(len(tier_1_adjusted))

    return {'tier 1': tier_1_adjusted}
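
# Example (hypothetical): inspect the ten closest sources for a query.
#   thoughts = retrieve_thoughts("sanctions on Russia", 10)
#   print(thoughts['tier 1'][['ref', 'title', 'url', 'score']])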


def qa_retrieve(query, llm=None):
    """Gradio handler: return the sources most relevant to `query`.

    `llm` defaults to None because the interface passes only the textbox
    value; a model from the commented-out pipeline above could be plugged
    in here for answer generation.
    """
    global db, mp_docs

    thoughts = retrieve_thoughts(query, 0)
    if not thoughts:
        # Fall back to the last successful retrieval, if any.
        if mp_docs:
            thoughts = mp_docs
    else:
        # Cache this retrieval for future empty results.
        mp_docs = thoughts

    tier_1 = thoughts['tier 1']
    reference = tier_1[['ref', 'url', 'title', 'content', 'score']].to_dict('records')
    return {'Reference': reference}
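
# Example (hypothetical): call the handler directly, outside Gradio.
#   result = qa_retrieve("Will Russia win the war in Ukraine?")
#   for ref in result['Reference'][:3]:
#       print(ref['ref'], ref['title'], ref['url'])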


def flush():
    return None


examples = [
    ["Will Russia win the war in Ukraine?"],
]

demo = gr.Interface(
    fn=qa_retrieve,
    title="cicero-qa-api",
    inputs=gr.inputs.Textbox(lines=5, label="what would you like to learn about?"),
    outputs=[gr.components.JSON(label="Reference")],
    examples=examples,
)

demo.queue(concurrency_count=4)
demo.launch()