import gradio as gr
import pandas as pd
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI  # only referenced by the commented-out pipelines below

from utils import ClaudeLLM, extract_website_name, remove_numbers

# Query/document embedding model; HuggingFaceEmbeddings defaults to
# sentence-transformers/all-mpnet-base-v2.
embeddings = HuggingFaceEmbeddings()

# Load the pre-built FAISS index from the local 'db_full' directory.
# (Newer langchain releases may also require allow_dangerous_deserialization=True.)
db = FAISS.load_local('db_full', embeddings)

# Cache of the most recent non-empty retrieval, used as a fallback when a
# new query returns nothing.
mp_docs = {}
# llm = ClaudeLLM()
# ChatOpenAI(
#             temperature=0,
#             model='gpt-3.5-turbo-16k'
#         )
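
# Hedged sketch (not part of the original app): one plausible way the
# 'db_full' index loaded above could be rebuilt. The function name, the
# chunking parameters, and the `texts`/`metadatas` arguments are
# illustrative assumptions, not the author's actual ingestion code.
def build_db_full_sketch(texts, metadatas, out_dir='db_full'):
    """Chunk raw article texts and persist a FAISS index (illustrative only)."""
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    # Each metadata dict is assumed to carry the 'title' and 'url' fields
    # that retrieve_thoughts() groups on.
    docs = splitter.create_documents(texts, metadatas=metadatas)
    for i, doc in enumerate(docs):
        doc.metadata['id'] = i  # chunk order used by retrieve_thoughts' sort_values('id')
    index = FAISS.from_documents(docs, HuggingFaceEmbeddings())
    index.save_local(out_dir)
    return index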


def add_text(history, text):
    """Append a user message to a chat history (Chatbot-style helper; not
    wired into the Interface below)."""
    history = history + [(text, None)]
    return history, ""

# pipeline = {'claude': (ClaudeLLM(), 0), 'gpt-3.5': (ChatOpenAI(temperature=0,model='gpt-3.5-turbo-16k'), 65), 'gpt-4': (ChatOpenAI(temperature=0, model='gpt-4'), 30)}

def retrieve_thoughts(query, n):
    """Retrieve documents relevant to `query` from the FAISS index.

    Scores are L2 distances, so lower means more similar. Chunks are
    filtered by score, grouped back into articles, and capped at `n`
    articles when n is non-zero.
    """
    # Fetch every stored chunk with its distance to the query embedding.
    k = len(db.index_to_docstore_id)
    docs_with_score = db.similarity_search_with_score(query=query, k=k, fetch_k=k)

    # Flatten the (document, score) pairs into one dataframe of metadata,
    # page content, and score.
    df = pd.DataFrame([doc.metadata for doc, _ in docs_with_score])
    df['page_content'] = [doc.page_content for doc, _ in docs_with_score]
    df['score'] = [score for _, score in docs_with_score]

    # TO-DO: What if user query doesn't match what we provide as documents

    # Tier 1: close matches only (distance < 0.95), capped at 150 chunks.
    tier_1 = df[df['score'] < 0.95].head(150)
    # tier_2 = df[(df['score'] < 0.95) & (df['score'] > 0.7)]

    # Stitch each article's chunks back together in their original order
    # (the 'id' metadata field) and keep one row per (title, url).
    chunks_1 = tier_1.groupby(['title', 'url']).apply(
        lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)
    ).values
    tier_1_adjusted = tier_1.groupby(['title', 'url']).first().reset_index()[['title', 'url', 'score']]
    tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1)
    tier_1_adjusted['content'] = chunks_1

    # chunks_2 = tier_2.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
    # tier_2_adjusted = tier_2.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
    # tier_2_adjusted['content'] = chunks_2

    if n:
        tier_1_adjusted = tier_1_adjusted.head(n)

    return {'tier 1': tier_1_adjusted}

def qa_retrieve(query):
    """Gradio handler: retrieve tier-1 references for `query` and return them
    as a JSON-serialisable payload."""
    global mp_docs

    thoughts = retrieve_thoughts(query, 0)
    if thoughts['tier 1'].empty:
        # Nothing retrieved for this query; fall back to the last good results.
        if mp_docs:
            thoughts = mp_docs
    else:
        mp_docs = thoughts

    tier_1 = thoughts['tier 1']
    # tier_2 = thoughts['tier 2']

    reference = tier_1[['ref', 'url', 'title', 'content', 'score']].to_dict('records')

    return {'Reference': reference}
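
# Hedged usage sketch: calling the handler directly (outside Gradio) returns
# a payload shaped like the JSON output component below; the query string is
# just an example.
#
#   qa_retrieve("Will Russia win the war in Ukraine?")
#   # -> {'Reference': [{'ref': 1, 'url': ..., 'title': ..., 'content': ..., 'score': ...}, ...]}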

def flush():
    """Reset handler (currently unused)."""
    return None

examples = [
    ["Will Russia win the war in Ukraine?"],
]

demo = gr.Interface(
    fn=qa_retrieve,
    title="cicero-qa-api",
    inputs=gr.Textbox(lines=5, label="what would you like to learn about?"),
    outputs=[gr.JSON(label="Reference")],
    examples=examples,
)

demo.queue(concurrency_count=4)  # Gradio 3.x queue API; Gradio 4 renamed this to default_concurrency_limit
demo.launch()
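
# Hedged usage sketch: once launched, Gradio 3.x also exposes the function
# over HTTP at /api/predict (port and route depend on version and settings):
#
#   curl -X POST http://localhost:7860/api/predict \
#        -H "Content-Type: application/json" \
#        -d '{"data": ["Will Russia win the war in Ukraine?"]}'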