import gradio as gr
import numpy as np
import pandas as pd
import re
import os
import ast
import requests
from typing import Optional, List, Mapping, Any

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import LLMChain
from langchain import PromptTemplate
from langchain.vectorstores import FAISS
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.llms.base import LLM

from utils import ClaudeLLM, extract_website_name, remove_numbers
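
# Embedding model and FAISS vector store (loaded from the local 'db_full' index) used for retrieval.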
embeddings = HuggingFaceEmbeddings()
db = FAISS.load_local('db_full', embeddings)

# Cache of the most recent retrieval results, reused when a new query returns nothing.
mp_docs = {}


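# add_text: append the new user message to the chat history and clear the textbox.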
def add_text(history, text):
    print(history)
    history = history + [(text, None)]
    return history, ""

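
# retrieve_thoughts: search the FAISS index, group the hits by source article, and
# split them into two relevance tiers by distance score (lower scores are closer matches).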
def retrieve_thoughts(query, n):
    # Pull every chunk in the index together with its similarity score.
    docs_with_score = db.similarity_search_with_score(query=query, k=len(db.index_to_docstore_id.values()), fetch_k=len(db.index_to_docstore_id.values()))

    df = pd.DataFrame([doc[0].metadata for doc in docs_with_score])
    df = pd.concat((df, pd.DataFrame([doc[0].page_content for doc in docs_with_score], columns=['page_content'])), axis=1)
    df = pd.concat((df, pd.DataFrame([doc[1] for doc in docs_with_score], columns=['score'])), axis=1)

    tier_1 = df[df['score'] < 0.7]
    tier_2 = df[(df['score'] < 0.95) & (df['score'] > 0.7)]

    # Stitch each article's chunks back together in their original order.
    chunks_1 = tier_1.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
    tier_1_adjusted = tier_1.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url', 'score']]
    tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1)
    tier_1_adjusted['content'] = chunks_1

    chunks_2 = tier_2.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
    tier_2_adjusted = tier_2.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
    tier_2_adjusted['content'] = chunks_2

    # Optionally cap the number of tier-1 articles returned.
    if n:
        tier_1_adjusted = tier_1_adjusted[:min(len(tier_1_adjusted), n)]

    print(len(tier_1_adjusted))

    # Tier 2 is returned in its grouped form, capped at five articles.
    return {'tier 1': tier_1_adjusted, 'tier 2': tier_2_adjusted[:5]}


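# qa_retrieve: Gradio handler that retrieves supporting articles for a query and
# returns them as a JSON-serialisable reference list.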
def qa_retrieve(query, llm=None):
    # `llm` is currently unused; the default allows Gradio to call this with the
    # text input alone.
    global db
    print(db)

    global mp_docs
    thoughts = retrieve_thoughts(query, 0)
    if not thoughts:
        # Fall back to the cached results from the previous query, if any.
        if mp_docs:
            thoughts = mp_docs
    else:
        mp_docs = thoughts

    tier_1 = thoughts['tier 1']
    tier_2 = thoughts['tier 2']

    reference = tier_1[['ref', 'url', 'title', 'content', 'score']].to_dict('records')

    return {'Reference': reference}


def flush():
    return None


examples = [
    ["Will Russia win the war in Ukraine?"],
]

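# Gradio interface: a single text input and a JSON output listing the retrieved references.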
demo = gr.Interface(fn=qa_retrieve, title="cicero-qa-api",
                    inputs=gr.Textbox(lines=5, label="what would you like to learn about?"),
                    outputs=[gr.JSON(label="Reference")],
                    examples=examples)

demo.queue(concurrency_count=4)
demo.launch()