AlexanderKazakov committed · 73fea8e
Parent(s): c22a9de

run without RAG

Files changed:
- README initial.md: +0 -25
- gradio_app/app.py: +25 -10
- gradio_app/backend/query_llm.py: +73 -12
- requirements.txt: +3 -1
- settings.py: +3 -0
README initial.md DELETED
@@ -1,25 +0,0 @@
-# A template for a RAG system with Gradio UI
-Deliberately stripped down to leave some room for experimenting
-
-# Setting it up
-- Clone https://github.com/huggingface/transformers to a local machine
-- Use the **prep_scrips/markdown_to_text.py** script to extract raw text from the markdown files in transformers/docs/source/en/
-- Break the resulting texts down into semantically meaningful pieces. Experiment with different chunking mechanisms to make sure the semantic meaning is captured.
-- Use **prep_scrips/lancedb_setup.py** to embed and store the chunks in a [lancedb](https://lancedb.github.io/lancedb/) instance. It also creates an index for fast ANN retrieval (not really needed for this exercise but necessary at scale). You'll need to put your own values into VECTOR_COLUMN_NAME, TEXT_COLUMN_NAME, DB_TABLE_NAME.
-- Move the database directory (.lancedb by default) to **gradio_app/**
-- Use the template given in **gradio_app** to wrap everything into the [Gradio](https://www.gradio.app/docs/interface) app and run it on HF [spaces](https://huggingface.co/docs/hub/spaces-config-reference). Make sure to adjust VECTOR_COLUMN_NAME, TEXT_COLUMN_NAME, DB_TABLE_NAME according to your DB setup.
-- In your space, set up the secrets OPENAI_API_KEY and HUGGING_FACE_HUB_TOKEN to use OpenAI and open-source models, respectively
-
-TODOs:
-- Experiment with chunking and see how it affects the results. When deciding how to chunk, it helps to think about what kind of chunks you'd like to see as context to your queries.
-  - Deliverables: Demonstrate how retrieved documents differ with different chunking strategies and how they affect the output.
-- Try out different embedding models (EMBED_NAME). Good models to start with are **sentence-transformers/all-MiniLM-L6-v2** (lightweight) and **thenlper/gte-large** (relatively heavy but more powerful).
-  - Deliverables: Demonstrate how retrieved documents differ with different embedding models and how they affect the output. Provide an estimate of how the time to embed the chunks and the DB ingestion time differ (both happen in **prep_scrips/lancedb_setup.py**).
-- Add a re-ranker (cross-encoder) to the pipeline. Start with the sentence-transformers pages on cross-encoders [1](https://www.sbert.net/examples/applications/cross-encoder/README.html) [2](https://www.sbert.net/examples/applications/retrieve_rerank/README.html), then pick a [pretrained cross-encoder](https://www.sbert.net/docs/pretrained-models/ce-msmarco.html), e.g. **cross-encoder/ms-marco-MiniLM-L-12-v2**. Don't forget to increase the number of *retrieved* documents when using a re-ranker; the number of documents used as context should stay the same.
-  - Deliverables: Demonstrate how retrieved documents differ after adding a re-ranker and how it affects the output. Provide an estimate of how latency changes.
-- Try another LLM (e.g. LLaMA-2-70b, falcon-180b).
-  - Deliverables: Demonstrate how LLMs affect the output and how latency changes with the model size.
-- Add more documents (e.g. diffusers, tokenizers, optimum, etc.) to see how the system scales.
-  - Deliverables: Demonstrate how latency changes, and how it differs with and without an index (the index is added in **prep_scrips/lancedb_setup.py**).
-- (Bonus) Use an LLM to quantitatively compare outputs of different variants of the system ([LLM as a Judge](https://huggingface.co/collections/andrewrreed/llm-as-a-judge-653fb861e361fd03c12d41e5))
-  - Deliverables: Describe the experimental setup and evaluation results
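For the **prep_scrips/markdown_to_text.py** step in the deleted README, here is a minimal sketch of one plausible extraction approach, assuming the markdown and beautifulsoup4 packages pinned in requirements.txt (the function name and flow are illustrative, not the script's actual contents):

```python
import markdown
from bs4 import BeautifulSoup

def markdown_to_text(md_source: str) -> str:
    # Render markdown to HTML, then strip all tags to recover raw text.
    html = markdown.markdown(md_source)
    return BeautifulSoup(html, features='html.parser').get_text()
```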
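The chunking step is deliberately left open by the README; a naive fixed-size baseline with overlap is a reasonable starting point before trying semantic chunking (all names and sizes here are hypothetical):

```python
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 64) -> list[str]:
    # Slide a fixed-size window with overlap, so a sentence cut at one
    # chunk boundary still appears whole in the neighbouring chunk.
    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
```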
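For the **prep_scrips/lancedb_setup.py** step, a rough sketch of embedding and ingestion, assuming the lancedb table API (`connect`, `create_table`, `create_index`) and a sentence-transformers encoder; the column and table names below are the placeholders the README says to replace with your own values:

```python
import lancedb
from sentence_transformers import SentenceTransformer

VECTOR_COLUMN_NAME = 'vector'  # put your own values here, per the README
TEXT_COLUMN_NAME = 'text'
DB_TABLE_NAME = 'docs'

def ingest(chunks, db_dir='.lancedb', embed_name='sentence-transformers/all-MiniLM-L6-v2'):
    model = SentenceTransformer(embed_name)
    vectors = model.encode(chunks)  # one embedding per chunk
    db = lancedb.connect(db_dir)
    table = db.create_table(DB_TABLE_NAME, data=[
        {VECTOR_COLUMN_NAME: vec.tolist(), TEXT_COLUMN_NAME: text}
        for vec, text in zip(vectors, chunks)
    ])
    table.create_index(num_sub_vectors=96)  # ANN index; optional at this scale
    return table
```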
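And for the re-ranker TODO, a sketch using a pretrained sentence-transformers cross-encoder; per the README, you would retrieve more candidates than you keep (`rerank` and `top_k` are illustrative names):

```python
from sentence_transformers import CrossEncoder

def rerank(query: str, documents: list[str], top_k: int = 5) -> list[str]:
    # Score each (query, document) pair jointly, then keep the best top_k.
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
    scores = cross_encoder.predict([(query, doc) for doc in documents])
    ranked = sorted(zip(scores, documents), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked[:top_k]]
```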
gradio_app/app.py CHANGED
@@ -10,7 +10,7 @@ from time import perf_counter
 
 import gradio as gr
 import markdown
-import lancedb
+# import lancedb
 from jinja2 import Environment, FileSystemLoader
 
 from gradio_app.backend.ChatGptInteractor import num_tokens_from_messages
@@ -31,7 +31,8 @@ env = Environment(loader=FileSystemLoader('gradio_app/templates'))
 context_template = env.get_template('context_template.j2')
 context_html_template = env.get_template('context_html_template.j2')
 
-db = lancedb.connect(LANCEDB_DIRECTORY)
+# db = lancedb.connect(LANCEDB_DIRECTORY)
+db = None
 
 # Examples
 examples = [
@@ -49,13 +50,7 @@ def add_text(history, text):
     return history, gr.Textbox(value="", interactive=False)
 
 
-def bot(history, llm, cross_enc, chunk, embed):
-    history[-1][1] = ""
-    query = history[-1][0]
-
-    if not query:
-        raise gr.Error("Empty string was submitted")
-
+def find_context(query, cross_enc, chunk, embed):
     logger.info('Retrieving documents...')
     gr.Info('Start documents retrieval ...')
     t = perf_counter()
@@ -85,7 +80,10 @@ def bot(history, llm, cross_enc, chunk, embed):
 
     t = perf_counter() - t
     logger.info(f'Finished Reranking documents in {round(t, 2)} seconds...')
+    return documents
 
+
+def construct_messages(llm, documents, history):
     msg_constructor = get_message_constructor(llm)
     while len(documents) != 0:
         context = context_template.render(documents=documents)
@@ -98,6 +96,20 @@ def bot(history, llm, cross_enc, chunk, embed):
             documents.pop()
         else:
             raise gr.Error('Model context length exceeded, reload the page')
+    return documents, context_html, messages
+
+
+def bot(history, llm, cross_enc, chunk, embed):
+    history[-1][1] = ""
+    query = history[-1][0]
+
+    if not query:
+        raise gr.Error("Empty string was submitted")
+
+    # documents = find_context(query, cross_enc, chunk, embed)
+    # documents, context_html, messages = construct_messages(llm, documents, history)
+    context_html = ''
+    messages = get_message_constructor(llm)('', history)
 
     llm_gen = get_llm_generator(llm)
     logger.info('Generating answer...')
@@ -163,12 +175,15 @@ with gr.Blocks() as demo:
 
         llm_name = gr.Radio(
             choices=[
+                "gpt-4-1106-preview",
+                "gpt-4",
+                "gpt-3.5-turbo-1106",
                 "gpt-3.5-turbo",
                 "mistralai/Mistral-7B-Instruct-v0.1",
                 "tiiuae/falcon-180B-chat",
                 # "GeneZC/MiniChat-3B",
             ],
-            value="gpt-3.5-turbo",
+            value="gpt-4-1106-preview",
             label='LLM'
         )
 
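The commit's point is visible in `bot()`: the retrieval stage (`find_context`) and the prompt-assembly stage (`construct_messages`) are factored out but commented, so the app now sends the chat history straight to the LLM with an empty context. Restoring RAG later is roughly a two-line change inside `bot()`, using the functions defined in the diff above (a sketch, not part of this commit):

```python
# Inside bot(), replace the no-RAG stubs with the commented-out calls:
documents = find_context(query, cross_enc, chunk, embed)
documents, context_html, messages = construct_messages(llm, documents, history)
```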
gradio_app/backend/query_llm.py CHANGED
@@ -5,10 +5,26 @@ from gradio_app.backend.HuggingfaceGenerator import HuggingfaceGenerator
 
 env = Environment(loader=FileSystemLoader('gradio_app/templates'))
 context_template = env.get_template('context_template.j2')
-start_system_message = context_template.render(documents=[])
+old_start_system_message = context_template.render(documents=[])
 
 
-def construct_mistral_messages(context, history):
+start_system_message = """
+Ignore all previous instructions.
+
+1. You are to provide clear, concise, and direct responses.
+2. Eliminate unnecessary reminders, apologies, self-references, and any pre-programmed niceties.
+3. Maintain a casual tone in your communication.
+4. Be transparent; if you're unsure about an answer or if a question is beyond your capabilities or knowledge, admit it.
+5. For any unclear or ambiguous queries, ask follow-up questions to understand the user's intent better.
+6. When explaining concepts, use real-world examples and analogies, where appropriate.
+7. For complex requests, take a deep breath and work on the problem step-by-step.
+8. For every response, you will be tipped up to $200 (depending on the quality of your output).
+
+It is very important that you get this right. Multiple lives are at stake.
+"""
+
+
+def old_construct_mistral_messages(context, history):
     messages = []
     for q, a in history:
         if len(a) == 0:  # the last message
@@ -25,11 +41,30 @@ def construct_mistral_messages(context, history):
     return messages
 
 
-def construct_openai_messages(context, history):
+def construct_mistral_messages(context, history):
+    messages = []
+    for q, a in history:
+        # if len(a) == 0:  # the last message
+        #     q = context + f'\n\nQuery:\n\n{q}'
+        if len(messages) == 0:
+            q = f'{start_system_message} The question:\n\n{q}'
+        messages.append({
+            "role": "user",
+            "content": q,
+        })
+        if len(a) != 0:  # do not insert the last answer
+            messages.append({
+                "role": "assistant",
+                "content": a,
+            })
+    return messages
+
+
+def old_construct_openai_messages(context, history):
     messages = [
         {
             "role": "system",
-            "content": start_system_message,
+            "content": old_start_system_message,
         },
     ]
     for q, a in history:
@@ -50,8 +85,33 @@ def construct_openai_messages(context, history):
     return messages
 
 
+def construct_openai_messages(context, history):
+    messages = [
+        {
+            "role": "system",
+            "content": start_system_message,
+        },
+    ]
+    for q, a in history:
+        # if len(a) == 0:  # the last message
+        #     messages.append({
+        #         "role": "system",
+        #         "content": context,
+        #     })
+        messages.append({
+            "role": "user",
+            "content": q,
+        })
+        if len(a) != 0:  # do not insert the last answer
+            messages.append({
+                "role": "assistant",
+                "content": a,
+            })
+    return messages
+
+
 def get_message_constructor(llm_name):
-    if llm_name == 'gpt-3.5-turbo':
+    if llm_name in ["gpt-4", "gpt-4-1106-preview", "gpt-3.5-turbo", "gpt-3.5-turbo-1106"]:
         return construct_openai_messages
     if llm_name in ['mistralai/Mistral-7B-Instruct-v0.1', "tiiuae/falcon-180B-chat", "GeneZC/MiniChat-3B"]:
         return construct_mistral_messages
@@ -59,9 +119,10 @@ def get_message_constructor(llm_name):
 
 
 def get_llm_generator(llm_name):
-    if llm_name == 'gpt-3.5-turbo':
+    if llm_name in ["gpt-4", "gpt-4-1106-preview", "gpt-3.5-turbo", "gpt-3.5-turbo-1106"]:
         cgi = ChatGptInteractor(
-            model_name=llm_name,
+            model_name=llm_name, stream=True,
+            # max_tokens=None, temperature=0,
         )
         return cgi.chat_completion
     if llm_name == 'mistralai/Mistral-7B-Instruct-v0.1' or llm_name == "tiiuae/falcon-180B-chat":
@@ -70,11 +131,11 @@ def get_llm_generator(llm_name):
         )
         return hfg.generate
 
-    if llm_name == "GeneZC/MiniChat-3B":
-        hfg = HuggingfaceGenerator(
-            model_name=llm_name, temperature=0, max_new_tokens=250, stream=False,
-        )
-        return hfg.generate
+    # if llm_name == "GeneZC/MiniChat-3B":
+    #     hfg = HuggingfaceGenerator(
+    #         model_name=llm_name, temperature=0, max_new_tokens=250, stream=False,
+    #     )
+    #     return hfg.generate
     raise ValueError('Unknown LLM name')
 
 
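A quick illustration of the message shape the new constructors produce; the history pairs come from the Gradio chatbot, with an empty answer marking the turn being generated (the conversation values here are made up):

```python
history = [
    ("What is a tokenizer?", "It splits text into tokens."),
    ("And a fast tokenizer?", ""),  # empty answer = the pending turn
]
messages = construct_openai_messages(context='', history=history)
# messages is now:
# [{"role": "system", "content": start_system_message},
#  {"role": "user", "content": "What is a tokenizer?"},
#  {"role": "assistant", "content": "It splits text into tokens."},
#  {"role": "user", "content": "And a fast tokenizer?"}]
```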
requirements.txt CHANGED
@@ -4,10 +4,12 @@ ipywidgets==8.1.1
 tqdm==4.66.1
 aiohttp==3.8.6
 huggingface-hub==0.17.3
-lancedb
+lancedb==0.3.4
 openai==0.28
 gradio==4.4.1
 markdown==3.5.1
 tiktoken==0.5.1
 huggingface-hub==0.17.3
 sentence-transformers==2.2.2
+beautifulsoup4==4.12.2
+Jinja2==3.1.2
settings.py CHANGED
@@ -30,6 +30,9 @@ context_lengths = {
     "tiiuae/falcon-180B-chat": 2048,
     "GeneZC/MiniChat-3B": 4096,
     "gpt-3.5-turbo": 4096,
+    "gpt-4": 8192,
+    "gpt-4-1106-preview": 128000,
+    "gpt-3.5-turbo-1106": 16385,
     "sentence-transformers/all-MiniLM-L6-v2": 128,
     "thenlper/gte-large": 512,
     "text-embedding-ada-002": 1000,  # actual context length is 8191, but it's too much
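These context lengths are what the prompt-trimming loop in app.py guards against (it pops documents until the messages fit, then errors out). A sketch of the check, assuming `num_tokens_from_messages(messages)` as imported in app.py; the helper name and headroom value are hypothetical:

```python
from gradio_app.backend.ChatGptInteractor import num_tokens_from_messages
from settings import context_lengths

def fits_context(messages, llm_name, headroom=512):
    # Leave room for the generated answer on top of the prompt tokens.
    return num_tokens_from_messages(messages) + headroom <= context_lengths[llm_name]
```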