|
import gradio as gr |
|
import numpy as np |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.chains import LLMChain |
|
from langchain import PromptTemplate |
|
import re |
|
import pandas as pd |
|
from langchain.vectorstores import FAISS |
|
import requests |
|
from typing import List |
|
from langchain.schema import ( |
|
SystemMessage, |
|
HumanMessage, |
|
AIMessage |
|
) |
|
import os |
|
from langchain.embeddings import HuggingFaceEmbeddings |
|
from langchain.chat_models import ChatOpenAI |
|
|
|
from langchain.llms.base import LLM |
|
from typing import Optional, List, Mapping, Any |
|
|
|
import ast |
|
from utils import ClaudeLLM |
|
|
|
from qdrant_client import models, QdrantClient |
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
|
|
|
|
# Module-level cache of the most recent successful retrieval; used as a
# fallback by the qa_retrieve_* handlers when a query returns nothing.
mp_docs = {}

# Client for the hosted Qdrant vector database.  The API key is read from the
# environment so no secret is hard-coded in this file.
qdrant = QdrantClient(
    "https://0a1b865d-8291-41ef-8c29-ca6c35e26391.us-east4-0.gcp.cloud.qdrant.io:6333",
    prefer_grpc=True,  # use the gRPC transport instead of REST
    api_key=os.environ.get('Qdrant_Api_Key')
)

# Sentence-embedding model used to encode search queries.
# NOTE(review): assumed to be the same model that built the Qdrant
# collection's vectors — confirm, otherwise scores are meaningless.
encoder = SentenceTransformer('BAAI/bge-large-en-v1.5')
|
def q_retrieve_thoughts(query, n, db="articles"):
    """Search the Qdrant collection ``db`` and return ranked documents.

    Every chunk in the collection is scored against ``query``; chunks are then
    grouped by their parent document (``_id``), stitched back together in
    chunk-id order, and each document is scored by the mean similarity of its
    chunks.  Documents scoring <= 0.5 are dropped.

    Args:
        query: free-text search query.
        n: currently unused; kept for signature compatibility with
           retrieve_thoughts() — TODO confirm intent.
        db: name of the Qdrant collection to search.

    Returns:
        dict with key 'tier 1' mapping to a DataFrame with columns
        ['_id', 'title', 'url', 'author', 'content', 'score'],
        sorted by score descending.
    """
    # Request as many hits as there are vectors so every chunk gets a score.
    v_len = qdrant.get_collection(db).dict()['vectors_count']
    hits = qdrant.search(
        collection_name=db,  # FIX: was hard-coded to "articles", ignoring `db`
        query_vector=encoder.encode(query).tolist(),
        limit=v_len
    )

    df = pd.DataFrame.from_records([dict(hit) for hit in hits])
    # Flatten the per-hit payload dicts into columns, then attach the score.
    payload = pd.DataFrame(list(df['payload'].values[:]))
    payload['score'] = df['score']
    del df  # free the raw hit frame early
    payload.sort_values('score', ascending=False, inplace=True)

    tier_1 = payload

    # Reassemble each document's chunks (ordered by chunk id), separated by "\n...\n".
    chunks_1 = tier_1.groupby(['_id', ]).apply(
        lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
    # Document score = mean score of its chunks.
    score = tier_1.groupby(['_id', ]).apply(lambda x: x['score'].mean()).values

    tier_1_adjusted = tier_1.groupby(['_id', ]).first().reset_index()[['_id', 'title', 'url', 'author']]
    tier_1_adjusted['content'] = list(chunks_1)
    tier_1_adjusted['score'] = score
    # Keep only documents with a reasonable mean similarity.
    tier_1_adjusted = tier_1_adjusted[tier_1_adjusted['score'] > 0.5]
    tier_1_adjusted.sort_values('score', ascending=False, inplace=True)
    return {'tier 1': tier_1_adjusted, }
|
|
|
def retrieve_thoughts(query, n, db):
    """Similarity-search a langchain FAISS store and return the top documents.

    Scores every chunk in the store against ``query``, groups chunks by their
    parent document (``_id``) and returns up to the 10 best documents.
    FAISS scores here are distances, so LOWER is better.

    Args:
        query: free-text search query.
        n: unused — TODO confirm whether it was meant to cap results.
        db: a langchain FAISS vector store (must expose
            similarity_search_with_score and index_to_docstore_id).

    Returns:
        dict with key 'tier 1' mapping to a DataFrame with columns
        ['_id', 'title', 'author', 'url', 'score', 'ref', 'chunks'],
        sorted by score ascending (best first).
    """
    # Ask for every vector in the index so each chunk gets scored.
    docs_with_score = db.similarity_search_with_score(query = query, k = len(db.index_to_docstore_id.values()), fetch_k = len(db.index_to_docstore_id.values()))

    # Build one row per chunk: metadata columns + page_content + distance score.
    df = pd.DataFrame([dict(doc[0])['metadata'] for doc in docs_with_score], )
    df = pd.concat((df, pd.DataFrame([dict(doc[0])['page_content'] for doc in docs_with_score], columns = ['page_content'])), axis = 1)
    df = pd.concat((df, pd.DataFrame([doc[1] for doc in docs_with_score], columns = ['score'])), axis = 1)
    df['_id'] = df['_id'].apply(lambda x: str(x))  # normalize ids (e.g. ObjectId) to str for grouping
    df.sort_values("score", inplace = True)  # ascending: smaller distance = better match

    # Keep only reasonably close chunks (distance < 1).
    tier_1 = df[df['score'] < 1]

    # Per document: expose chunks as {"chunk_0": {...}, ...} in chunk-id order.
    chunks_1 = tier_1.groupby(['_id' ]).apply(lambda x: {f"chunk_{i}": row for i, row in enumerate(x.sort_values('id')[['id', 'score','page_content']].to_dict('records'))}).values
    tier_1_adjusted = tier_1.groupby(['_id']).first().reset_index()[['_id', 'title', 'author','url', 'score']]
    tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1 )  # 1-based citation number
    tier_1_adjusted['chunks'] = list(chunks_1)
    # Document score = mean distance of its surviving chunks.
    score = tier_1.groupby(['_id' ]).apply(lambda x: x['score'].mean()).values
    tier_1_adjusted['score'] = score
    tier_1_adjusted.sort_values("score", inplace = True)

    # Cap the result at the 10 closest documents.
    tier_1_adjusted = tier_1_adjusted[:min(len(tier_1_adjusted), 10)]

    return {'tier 1':tier_1_adjusted, }
|
|
|
def qa_retrieve_art(query,):
    """Gradio handler: retrieve articles matching ``query`` from Qdrant.

    Falls back to the last cached retrieval (``mp_docs``) when the current
    query yields nothing, and refreshes the cache on success.

    Returns:
        {'Reference': [{_id, url, author, title, content, score}, ...]}
    """
    # FIX: removed unused local `docs` and unused `global db_art` (this
    # handler queries Qdrant via q_retrieve_thoughts, not a FAISS store).
    global mp_docs

    thoughts = q_retrieve_thoughts(query, 0)
    if not thoughts:
        # Empty result: reuse the previously cached retrieval, if any.
        if mp_docs:
            thoughts = mp_docs
    else:
        # Success: cache for future empty queries.  (Caching only a falsy
        # result would be a no-op, so the update belongs on this branch.)
        mp_docs = thoughts

    tier_1 = thoughts['tier 1']
    reference = tier_1[['_id', 'url', 'author', 'title', 'content', 'score']].to_dict('records')
    return {'Reference': reference}
|
|
|
|
|
def qa_retrieve_yt(query,):
    """Gradio handler: retrieve YouTube-transcript documents for ``query``.

    Mirrors qa_retrieve_art but searches the FAISS store ``db_yt``.
    NOTE(review): db_yt is not defined anywhere in this file — confirm it is
    created elsewhere before wiring this handler into an interface (it is not
    currently included in the gr.Parallel demo below).

    Returns:
        {'Reference': [{_id, url, author, title, chunks, score}, ...]}
    """
    # FIX: removed unused local `docs`; cache-update logic aligned with
    # qa_retrieve_art (refresh mp_docs on success, not on failure).
    global db_yt
    global mp_docs

    thoughts = retrieve_thoughts(query, 0, db_yt)
    if not thoughts:
        # Empty result: reuse the previously cached retrieval, if any.
        if mp_docs:
            thoughts = mp_docs
    else:
        # Success: cache for future empty queries.
        mp_docs = thoughts

    tier_1 = thoughts['tier 1']
    reference = tier_1[['_id', 'url', 'author', 'title', 'chunks', 'score']].to_dict('records')
    return {'Reference': reference}
|
|
|
def flush():
    """Reset callback for the UI: produces no value (clears the output)."""
    return
|
|
|
|
|
|
|
# --- Gradio UI wiring -------------------------------------------------------
# NOTE(review): gr.inputs.Textbox is the legacy (pre-3.x "inputs" namespace)
# Gradio API, and `label=` is not a documented gr.Interface argument —
# confirm the pinned gradio version accepts both.
ref_art = gr.Interface(fn=qa_retrieve_art, label="Articles",
                       inputs=gr.inputs.Textbox(lines=5, label="what would you like to learn about?"),
                       outputs = gr.components.JSON(label="articles"))

# gr.Parallel feeds the same input to every wrapped interface; only the
# article retriever is exposed here (qa_retrieve_yt is not wired in).
demo = gr.Parallel( ref_art,)

demo.launch()