import gradio as gr |
import numpy as np |
from langchain.text_splitter import RecursiveCharacterTextSplitter |
from langchain.chains import LLMChain |
from langchain import PromptTemplate |
import re |
import pandas as pd |
from langchain.vectorstores import FAISS |
import requests |
from typing import List |
from langchain.schema import ( |
SystemMessage, |
HumanMessage, |
AIMessage |
) |
import os |
from langchain.embeddings import HuggingFaceEmbeddings |
from langchain.chat_models import ChatOpenAI |
from langchain.llms.base import LLM |
from typing import Optional, List, Mapping, Any |
import ast |
from utils import ClaudeLLM, extract_website_name, remove_numbers |
# Embedding model (library defaults) used both to load the index and to embed queries.
embeddings = HuggingFaceEmbeddings()

# FAISS vector store loaded from the local 'db_full' location.
db = FAISS.load_local('db_full', embeddings)

# Module-level cache of the most recent retrieval result; written by qa_retrieve.
mp_docs = dict()
def add_text(history, text):
    """Return a new history with ``(text, None)`` appended, plus an empty string.

    The incoming ``history`` is printed first and is not modified in place;
    a new list is built by concatenation.

    Args:
        history: List of ``(user_text, reply)`` pairs accumulated so far.
        text: The text to record as a new, not-yet-answered entry.

    Returns:
        A ``(new_history, "")`` tuple.
    """
    print(history)
    pending_turn = (text, None)
    return history + [pending_turn], ""
def _join_document_chunks(tier):
    """Return one joined text per (title, url, _id) group, in groupby key order.

    Within each group the chunks are ordered by their 'id' column and joined
    with a "\\n...\\n" separator. When the tier yields no groups an empty list
    is returned: groupby.apply on an empty frame may not produce a 1-D
    per-group result, so its ``.values`` cannot safely be assigned as a
    single column.
    """
    joined = tier.groupby(['title', 'url', '_id']).apply(
        lambda chunk_rows: "\n...\n".join(
            chunk_rows.sort_values('id')['page_content'].values
        )
    )
    if joined.empty:
        return []
    return joined.values


def retrieve_thoughts(query, n):
    """Score every indexed chunk against ``query`` and bucket the results.

    The whole index is searched (``k`` equals the number of stored entries),
    and the hits are flattened into a DataFrame holding each document's
    metadata columns plus 'page_content' and 'score'.

    Args:
        query: Text passed to ``db.similarity_search_with_score``.
        n: If truthy, keep at most this many rows of the tier-1 frame.

    Returns:
        A dict with two entries:
          * 'tier 1': one row per (title, url, _id) group among chunks with
            score < 0.7, with columns '_id', 'title', 'url', 'score', 'ref'
            (1-based running number) and 'content' (the group's chunks
            joined in 'id' order).
          * 'tier 2': the raw chunk-level rows with 0.7 < score < 0.95,
            sliced with ``.loc[:5]``.
    """
    # Size of the index; used for both k and fetch_k so nothing is cut off.
    index_size = len(db.index_to_docstore_id)
    docs_with_score = db.similarity_search_with_score(
        query=query, k=index_size, fetch_k=index_size
    )

    # Side-by-side frames share the default 0..N-1 row labels, so a single
    # column-wise concat lines them up row for row.
    df = pd.concat(
        (
            pd.DataFrame([doc.metadata for doc, _ in docs_with_score]),
            pd.DataFrame(
                [doc.page_content for doc, _ in docs_with_score],
                columns=['page_content'],
            ),
            pd.DataFrame(
                [score for _, score in docs_with_score], columns=['score']
            ),
        ),
        axis=1,
    )

    # NOTE: both comparisons are strict, so a score of exactly 0.7 lands in
    # neither tier.
    tier_1 = df[df['score'] < 0.7]
    tier_2 = df[(df['score'] > 0.7) & (df['score'] < 0.95)]

    # One row per source document; 'score' is the first non-null score seen
    # in each group.
    tier_1_adjusted = (
        tier_1.groupby(['title', 'url', '_id'])
        .first()
        .reset_index()[['_id', 'title', 'url', 'score']]
    )
    tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1)
    tier_1_adjusted['content'] = _join_document_chunks(tier_1)

    if n:
        tier_1_adjusted = tier_1_adjusted[:min(len(tier_1_adjusted), n)]

    print(len(tier_1_adjusted))

    # NOTE(review): `.loc[:5]` slices by row label (inclusive), not by
    # position -- confirm this is intended rather than the first five rows.
    return {'tier 1': tier_1_adjusted, 'tier 2': tier_2.loc[:5]}
def qa_retrieve(query, llm=None):
    """Return the tier-1 references retrieved for ``query``.

    Args:
        query: The question text to search the index with.
        llm: Unused by this function. It defaults to ``None`` so the function
            can be called with the query alone, as the single-input Gradio
            interface in this file does.

    Returns:
        ``{'Reference': [...]}`` where each list item is a dict with the keys
        'ref', 'url', 'title', 'content' and 'score'. The list is empty when
        neither a fresh result nor a cached one is available.
    """
    global mp_docs
    print(db)

    thoughts = retrieve_thoughts(query, 0)
    if thoughts:
        # Remember the latest successful retrieval for later fallback.
        mp_docs = thoughts
    elif mp_docs:
        thoughts = mp_docs

    if not thoughts:
        # Nothing retrieved and nothing cached: report no references rather
        # than failing on the lookup below.
        return {'Reference': []}

    tier_1 = thoughts['tier 1']
    reference = tier_1[['ref', 'url', 'title', 'content', 'score']].to_dict('records')
    return {'Reference': reference}
def flush():
    """Do nothing and return None; takes no arguments."""
# Sample question offered alongside the input box.
examples = [["Will Russia win the war in Ukraine?"]]

# One multi-line text box in, one JSON panel out, served by qa_retrieve.
demo = gr.Interface(
    fn=qa_retrieve,
    title="cicero-qa-api",
    inputs=gr.inputs.Textbox(
        lines=5,
        label="what would you like to learn about?",
    ),
    outputs=[gr.components.JSON(label="Reference")],
    examples=examples,
)

# Turn on request queuing (concurrency_count of 4), then start the app.
demo.queue(concurrency_count=4)
demo.launch()