Rams901 committed · commit 734db66
Duplicate from Rams901/Cicero-QA-themes
Files changed:
- .gitattributes +35 -0
- README.md +13 -0
- app.py +192 -0
- db_full/index.faiss +3 -0
- db_full/index.pkl +3 -0
- requirements.txt +9 -0
- utils.py +49 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+db_full/index.faiss filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,13 @@
+---
+title: Cicero Interactive QA Dev
+emoji: 🏃
+colorFrom: green
+colorTo: gray
+sdk: gradio
+sdk_version: 3.23.0
+app_file: app.py
+pinned: false
+duplicated_from: Rams901/Cicero-QA-themes
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,192 @@
+import gradio as gr
+import numpy as np
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.chains import LLMChain
+from langchain import PromptTemplate
+import re
+import pandas as pd
+from langchain.vectorstores import FAISS
+import requests
+from typing import List
+from langchain.schema import (
+    SystemMessage,
+    HumanMessage,
+    AIMessage
+)
+import os
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.chat_models import ChatOpenAI
+
+from langchain.llms.base import LLM
+from typing import Optional, List, Mapping, Any
+
+import ast
+from utils import ClaudeLLM, extract_website_name, remove_numbers
+
+embeddings = HuggingFaceEmbeddings()
+db = FAISS.load_local('db_full', embeddings)
+
+mp_docs = {}
+llm = ClaudeLLM()
+# ChatOpenAI(
+#     temperature=0,
+#     model='gpt-3.5-turbo-16k'
+# )
+
+
+def add_text(history, text):
+    print(history)
+    history = history + [(text, None)]
+    return history, ""
+
+
+pipeline = {'claude': (ClaudeLLM(), 0),
+            'gpt-3.5': (ChatOpenAI(temperature=0, model='gpt-3.5-turbo-16k'), 65),
+            'gpt-4': (ChatOpenAI(temperature=0, model='gpt-4'), 30)}
+
+
+def retrieve_thoughts(query, n):
+    # Score every chunk in the index against the query.
+    # print(db.similarity_search_with_score(query = query, k = k, fetch_k = k*10))
+    docs_with_score = db.similarity_search_with_score(
+        query=query,
+        k=len(db.index_to_docstore_id.values()),
+        fetch_k=len(db.index_to_docstore_id.values()))
+    df = pd.DataFrame([dict(doc[0])['metadata'] for doc in docs_with_score])
+    df = pd.concat((df, pd.DataFrame([dict(doc[0])['page_content'] for doc in docs_with_score], columns=['page_content'])), axis=1)
+    df = pd.concat((df, pd.DataFrame([doc[1] for doc in docs_with_score], columns=['score'])), axis=1)
+
+    # TO-DO: what if the user query doesn't match any of the documents we provide?
+    tier_1 = df[df['score'] < 0.7]
+    tier_2 = df[(df['score'] < 0.95) & (df['score'] > 0.7)]
+
+    # Re-assemble each article's chunks (grouped by title/url/_id) in their original order.
+    chunks_1 = tier_1.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
+    tier_1_adjusted = tier_1.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
+    tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1)
+    tier_1_adjusted['content'] = chunks_1
+
+    chunks_2 = tier_2.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
+    tier_2_adjusted = tier_2.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
+    tier_2_adjusted['content'] = chunks_2
+
+    if n:
+        tier_1_adjusted = tier_1_adjusted[:min(len(tier_1_adjusted), n)]
+
+    print(len(tier_1_adjusted))
+    # tier_1 = [doc[0] for doc in docs if ((doc[1] < 1))][:5]
+    # tier_2 = [doc[0] for doc in docs if ((doc[1] > 0.7)*(doc[1] < 1.5))][10:15]
+
+    # Return the per-article frames; downstream code reads their 'content' column.
+    return {'tier 1': tier_1_adjusted, 'tier 2': tier_2_adjusted[:5]}
+
+
+def qa_retrieve(query, llm=None):
+    # The Gradio interface passes only `query`; the model is fixed to Claude here.
+    llm = pipeline["claude"][0]
+
+    docs = ""
+
+    global db
+    print(db)
+
+    global mp_docs
+    thoughts = retrieve_thoughts(query, 0)
+    if not thoughts:
+        # Fall back to the last successful retrieval.
+        if mp_docs:
+            thoughts = mp_docs
+    else:
+        mp_docs = thoughts
+
+    tier_1 = thoughts['tier 1']
+    tier_2 = thoughts['tier 2']
+
+    reference = tier_1[['ref', 'url', 'title']].to_dict('records')
+
+    tier_1 = list(tier_1.apply(lambda x: f"[{int(x['ref'])}] title: {x['title']}\n Content: {x.content}", axis=1).values)
+    print(len(tier_1))
+    tier_2 = list(tier_2.apply(lambda x: f"title: {x['title']}\n Content: {x.content}", axis=1).values)
+
+    print(f"QUERY: {query}\nTIER 1: {tier_1}\nTIER2: {tier_2}")
+    # print(f"DOCS RETRIEVED: {mp_docs.values}")
+
+    # Synthesis generation
+    session_prompt = """A bot that is open to discussions about different cultural, philosophical and political exchanges. You will run different analyses on the articles provided to you. Stay truthful, and if you weren't provided any resources, give your opinion only."""
+    task = """Your primary responsibility is to identify multiple themes from the given articles. For each theme detected, you are to present it under three separate categories:
+
+1. Theme Title - An easy-to-understand title that encapsulates the core idea of the theme extracted from the article.
+
+2. Theme Description - An expanded elaboration that explores the theme in detail based on the arguments and points provided in the article.
+
+3. Quotes related to theme - Locate and provide at least one compelling quote from the article that directly supports or showcases the theme you have identified. This quote should serve as specific evidence or an example from the article text that corresponds directly to the developed theme.
+
+The extracted themes should be written in a structured manner, ensuring clarity and a meaningful correlation between the themes and the articles. Make sure your analysis is rooted in the arguments given in the article. Avoid including personal opinions or making generalizations that are not explicitly supported by the articles."""
+
+    prompt = PromptTemplate(
+        input_variables=["query", "task", "session_prompt", "articles"],
+        template="""
+You are a {session_prompt}
+{task}
+
+query: {query}
+
+Articles:
+{articles}
+
+The extracted themes should be written in a structured manner, ensuring clarity and a meaningful correlation between the themes and the articles. Make sure your analysis is rooted in the arguments given in the article. Avoid including personal opinions or making generalizations that are not explicitly supported by the articles.
+""",
+    )
+
+    # llm = BardLLM()
+    chain = LLMChain(llm=llm, prompt=prompt)
+
+    response = chain.run(query=query, articles="\n".join(tier_1), session_prompt=session_prompt, task=task)
+
+    # Wrap the numbered citation markers so the UI can highlight them.
+    for i in range(5):
+        response = response.replace(f'[{i}]', f"<span class='text-primary'>[{i}]</span>")
+
+    # Generate related questions
+    prompt_q = PromptTemplate(
+        input_variables=["session_prompt", "articles"],
+        template="""
+You are a {session_prompt}
+Give general/global questions related to the following articles:
+
+Articles:
+{articles}
+
+Make sure not to ask specific questions; keep them general, short and concise.
+""",
+    )
+
+    chain_q = LLMChain(llm=ClaudeLLM(), prompt=prompt_q)
+
+    questions = chain_q.run(session_prompt=session_prompt, articles="\n".join(tier_2))
+    print(questions)
+    # Drop everything before the first numbered item, then split into individual questions.
+    questions = questions[questions.index('1'):]
+    questions = [remove_numbers(t).strip() for (i, t) in enumerate(questions.split('.')) if len(t) > 5][:5]
+    print(questions)
+
+    # TO-DO: initiate models in another function, refactor code to be reusable
+    # json_resp = {'synthesis': response, 'questions': questions, 'Reference': reference}
+
+    return response, {'Reference': reference}
+
+
+def flush():
+    return None
+
+
+examples = [
+    ["Will Russia win the war in Ukraine?"],
+]
+
+demo = gr.Interface(fn=qa_retrieve, title="cicero-qa-api",
+                    inputs=gr.inputs.Textbox(lines=5, label="What would you like to learn about?"),
+                    outputs=[gr.components.Textbox(lines=3, label="Themes"),
+                             gr.components.JSON(label="Reference")], examples=examples)
+
+demo.queue(concurrency_count=4)
+demo.launch()
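As a side note, the two-tier retrieval above can be smoke-tested locally without launching the Gradio app. The sketch below is illustrative only (not part of the commit); it assumes db_full/ is available locally and that the default sentence-transformers model can be downloaded, and it reuses the 0.7 / 0.95 score cut-offs from retrieve_thoughts:

    # Sketch: inspect retrieval scores and tier assignment for one query.
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import FAISS

    embeddings = HuggingFaceEmbeddings()
    db = FAISS.load_local('db_full', embeddings)

    hits = db.similarity_search_with_score(query="Will Russia win the war in Ukraine?", k=20)
    for doc, score in hits:
        # Same thresholds as retrieve_thoughts: < 0.7 is tier 1, 0.7-0.95 is tier 2.
        tier = 1 if score < 0.7 else (2 if score < 0.95 else None)
        print(tier, round(float(score), 3), doc.metadata.get('title'))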
db_full/index.faiss
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9051c0122a839f58dc047ba2145fd887b64a33ecd746bd17aec950ca044f0653
+size 354250797
db_full/index.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ccf07d4b39015b8e101152d341883a99d311f22ab7dfc5edba88277041b9179
+size 102244751
requirements.txt
ADDED
@@ -0,0 +1,9 @@
+pandas
+langchain
+openai
+faiss-gpu
+tiktoken
+transformers
+sentence_transformers
+bson
+anthropic==0.2.10
utils.py
ADDED
@@ -0,0 +1,49 @@
+from langchain.llms.base import LLM
+from typing import Optional, List, Mapping, Any
+import anthropic
+from urllib.parse import urlparse
+import os
+
+
+class ClaudeLLM(LLM):
+
+    @property
+    def _llm_type(self) -> str:
+        return "custom"
+
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+        client = anthropic.Client(os.environ['ANTHROPIC_KEY'])
+
+        # Wrap the raw prompt in Anthropic's Human/Assistant turn markers.
+        prompt_formatted = (
+            f"{anthropic.HUMAN_PROMPT}{prompt}\n{anthropic.AI_PROMPT}"
+        )
+
+        response = client.completion(
+            prompt=prompt_formatted,
+            stop_sequences=[anthropic.HUMAN_PROMPT],
+            model="claude-instant-v1-100k",
+            max_tokens_to_sample=100000,
+            temperature=0.3,
+        )
+
+        return response["completion"]
+
+    @property
+    def _identifying_params(self) -> Mapping[str, Any]:
+        """Get the identifying parameters."""
+        return {}
+
+
+def remove_numbers(question):
+    # Strip every digit character from the question text.
+    return question.translate(str.maketrans('', '', '0123456789'))
+
+
+def extract_website_name(url):
+    # e.g. "https://www.example.com/a" -> "example"
+    parsed_url = urlparse(url)
+    if parsed_url.netloc.startswith("www."):
+        return parsed_url.netloc.split("www.")[1].split(".")[0]
+    return parsed_url.netloc.split(".")[0]
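The two string helpers are pure functions, so they can be sanity-checked without an API key. A quick hypothetical check (the URL and question strings here are made up for illustration):

    from utils import extract_website_name, remove_numbers

    print(extract_website_name("https://www.example.com/article"))  # -> "example"
    print(remove_numbers("1. Will the conflict escalate?"))         # -> ". Will the conflict escalate?"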