# app.py -- last change: "Update app.py" by Rams901 (commit e72ac74, 5.34 kB).
# (Header recovered from the hosting page's file view; not part of the program.)
import gradio as gr
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import LLMChain
from langchain import PromptTemplate
import re
import pandas as pd
from langchain.vectorstores import FAISS
import requests
from typing import List
from langchain.schema import (
SystemMessage,
HumanMessage,
AIMessage
)
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
import ast
from utils import ClaudeLLM
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer
# Legacy FAISS stores, currently disabled.
# NOTE(review): `qa_retrieve_yt` still reads the `db_yt` global, which is only
# defined by the commented-out line below -- confirm before re-enabling it.
# embeddings = HuggingFaceEmbeddings()
# db_art = FAISS.load_local('db_art', embeddings)
# db_yt = FAISS.load_local('db_yt', embeddings)

# Most recent non-empty retrieval result; the qa_retrieve_* handlers store
# their result here and fall back to it when a retrieval comes back empty.
mp_docs = {}

# The API key is read from the environment. `os.environ` is the mapping of
# environment variables (`os.env` does not exist); a missing variable raises
# KeyError at import time.
qdrant = QdrantClient(
    "https://0a1b865d-8291-41ef-8c29-ca6c35e26391.us-east4-0.gcp.cloud.qdrant.io:6333",
    prefer_grpc=True,
    api_key=os.environ['Qdrant_Api_Key'],
)

# Sentence encoder used to embed queries before searching Qdrant.
encoder = SentenceTransformer('BAAI/bge-large-en-v1.5')
def q_retrieve_thoughts(query, n, db="articles"):
    """Search a Qdrant collection and aggregate the matching chunks per document.

    Args:
        query: Free-text query; it is embedded with the module-level
            ``encoder`` and used as the search vector.
        n: Unused; kept so existing callers keep working.
        db: Name of the Qdrant collection to search.

    Returns:
        ``{'tier 1': DataFrame}`` with one row per ``_id`` group and the
        columns ``title``, ``url``, ``author``, ``content`` and ``score``.
        ``content`` is the group's ``page_content`` values ordered by ``id``
        and joined with ``"\n...\n"``; ``score`` is the group's mean hit
        score. Only groups with a mean score above 0.5 are kept, ordered by
        descending score. The frame is empty when the search returns nothing.
    """
    result_columns = ['title', 'url', 'author', 'content', 'score']

    hits = qdrant.search(
        collection_name=db,
        query_vector=encoder.encode(query).tolist(),
        limit=4000,  # TO-DO: know the right number of thoughts existing maybe using get_collection
    )
    if not hits:
        # Nothing matched: return an empty frame with the usual columns
        # instead of failing on the missing payload fields below.
        return {'tier 1': pd.DataFrame(columns=result_columns)}

    # One row per hit: the stored payload fields plus the hit's score.
    payload = pd.DataFrame([hit.payload or {} for hit in hits])
    payload['score'] = [hit.score for hit in hits]
    # Best hits first, so `first()` below takes each group's metadata from
    # its highest-scoring chunk.
    payload.sort_values('score', ascending=False, inplace=True)

    grouped = payload.groupby('_id')
    summary = grouped.first().reset_index()[['title', 'url', 'author']]
    # Iterating the groupby yields groups in the same sorted-key order as
    # `first()`, so the lists below line up with the rows of `summary`.
    summary['content'] = [
        "\n...\n".join(group.sort_values('id')['page_content'].values)
        for _, group in grouped
    ]
    summary['score'] = grouped['score'].mean().values

    summary = summary[summary['score'] > 0.5]
    summary = summary.sort_values('score', ascending=False)
    return {'tier 1': summary, }
def retrieve_thoughts(query, n, db):
    """Score every document in a vector store against ``query`` and group by ``_id``.

    Args:
        query: Free-text query passed to ``db.similarity_search_with_score``.
        n: Unused; kept so existing callers keep working.
        db: Vector store exposing ``index_to_docstore_id`` and
            ``similarity_search_with_score(query=, k=, fetch_k=)`` that
            returns ``(document, score)`` pairs, where each document has
            ``metadata`` (with at least ``_id``, ``id``, ``title``,
            ``author``, ``url``) and ``page_content``.

    Returns:
        ``{'tier 1': DataFrame}`` with the columns ``_id``, ``title``,
        ``author``, ``url``, ``score``, ``ref`` and ``chunks``; at most ten
        rows, ordered by ascending mean score. ``chunks`` maps
        ``"chunk_<i>"`` to that chunk's ``id``/``score``/``page_content``
        record, ordered by ``id``. The frame is empty when nothing is
        returned or no row scores below 1.
    """
    result_columns = ['_id', 'title', 'author', 'url', 'score', 'ref', 'chunks']

    # Ask for every stored document so all of them get a score.
    total = len(db.index_to_docstore_id)
    docs_with_score = db.similarity_search_with_score(query=query, k=total, fetch_k=total)

    records = [
        {**doc.metadata, 'page_content': doc.page_content, 'score': score}
        for doc, score in docs_with_score
    ]
    if not records:
        return {'tier 1': pd.DataFrame(columns=result_columns)}

    df = pd.DataFrame(records)
    df['_id'] = df['_id'].map(str)
    df.sort_values("score", inplace=True)

    # Keep only rows scoring below 1. NOTE(review): the ascending sort and
    # this cutoff suggest the score is a distance (lower is closer) -- confirm.
    tier_1 = df[df['score'] < 1]
    if tier_1.empty:
        # The query matched nothing close enough; return an empty result
        # rather than relying on groupby behaviour over an empty frame.
        return {'tier 1': pd.DataFrame(columns=result_columns)}

    grouped = tier_1.groupby('_id')
    summary = grouped.first().reset_index()[['_id', 'title', 'author', 'url', 'score']]
    # `ref` numbers the documents in `_id` order, before the score sort below.
    summary['ref'] = range(1, len(summary) + 1)
    # Groups iterate in the same sorted-key order as `first()`, so these
    # lists line up with the rows of `summary`.
    summary['chunks'] = [
        {
            f"chunk_{i}": row
            for i, row in enumerate(
                group.sort_values('id')[['id', 'score', 'page_content']].to_dict('records')
            )
        }
        for _, group in grouped
    ]
    # Replace the first-row score with the mean score over the group.
    summary['score'] = grouped['score'].mean().values
    summary = summary.sort_values("score")
    return {'tier 1': summary.head(10), }
def qa_retrieve_art(query,):
    """Return article references for ``query`` as a JSON-serialisable dict.

    Calls ``q_retrieve_thoughts`` and stores a non-empty result in the
    module-level ``mp_docs``; an empty result falls back to whatever
    ``mp_docs`` already holds.

    Args:
        query: Free-text query from the UI.

    Returns:
        ``{'Reference': [record, ...]}`` with one record per row of the
        retrieved ``'tier 1'`` frame, or an empty list when there is neither
        a fresh nor a cached result.
    """
    global mp_docs

    thoughts = q_retrieve_thoughts(query, 0)
    if not thoughts:
        # Nothing retrieved this time: reuse the last stored result, if any.
        if mp_docs:
            thoughts = mp_docs
    else:
        mp_docs = thoughts

    if not thoughts:
        return {'Reference': []}

    tier_1 = thoughts['tier 1']
    # `q_retrieve_thoughts` returns title/url/author/content/score, while a
    # stored result may carry `_id`/`chunks` instead; select whichever of
    # the known columns are present rather than failing on a missing one.
    wanted = ['_id', 'url', 'author', 'title', 'chunks', 'content', 'score']
    present = [column for column in wanted if column in tier_1.columns]
    reference = tier_1[present].to_dict('records')
    return {'Reference': reference}
def qa_retrieve_yt(query,):
    """Return YouTube references for ``query`` as a JSON-serialisable dict.

    Calls ``retrieve_thoughts`` against the module-level ``db_yt`` store and
    stores a non-empty result in the module-level ``mp_docs``; an empty
    result falls back to whatever ``mp_docs`` already holds.

    NOTE(review): ``db_yt`` is only assigned in commented-out code in the
    visible source -- confirm it is defined before this handler is served.

    Args:
        query: Free-text query from the UI.

    Returns:
        ``{'Reference': [record, ...]}`` with the ``_id``, ``url``,
        ``author``, ``title``, ``chunks`` and ``score`` of each row in the
        retrieved ``'tier 1'`` frame.
    """
    global db_yt
    global mp_docs

    thoughts = retrieve_thoughts(query, 0, db_yt)
    if thoughts:
        # Remember the latest non-empty result.
        mp_docs = thoughts
    elif mp_docs:
        # Nothing retrieved this time: reuse the last stored result.
        thoughts = mp_docs

    columns = ['_id', 'url', 'author', 'title', 'chunks', 'score']
    records = thoughts['tier 1'][columns].to_dict('records')
    return {'Reference': records}
def flush():
    """Do nothing and return ``None``."""
    return
# Example queries offered by each interface.
examples = [
    ["Will Russia win the war in Ukraine?"],
]

# Inputs use `gr.components.Textbox`, matching the `gr.components.JSON`
# outputs; the older `gr.inputs.*` namespace is deprecated.
ref_art = gr.Interface(
    fn=qa_retrieve_art,
    label="Articles",
    inputs=gr.components.Textbox(lines=5, label="what would you like to learn about?"),
    outputs=gr.components.JSON(label="articles"),
    examples=examples,
)

ref_yt = gr.Interface(
    fn=qa_retrieve_yt,
    label="Youtube",
    inputs=gr.components.Textbox(lines=5, label="what would you like to learn about?"),
    outputs=gr.components.JSON(label="youtube"),
    title="youtube",
    examples=examples,
)

# Only the articles interface is served; `ref_yt` is built but not included.
demo = gr.Parallel(ref_art,)
demo.launch()