ajv009 and jamescalam committed
Commit 2f4d5d4 (0 parents)

Duplicate from pinecone/openai-ml-qa


Co-authored-by: James Briggs <jamescalam@users.noreply.huggingface.co>

Files changed (4)
  1. .gitattributes +34 -0
  2. README.md +13 -0
  3. app.py +265 -0
  4. requirements.txt +7 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: OpenAI ML Q&A
+ emoji: 🌖
+ colorFrom: yellow
+ colorTo: red
+ sdk: streamlit
+ sdk_version: 1.15.2
+ app_file: app.py
+ pinned: false
+ duplicated_from: pinecone/openai-ml-qa
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,265 @@
+ import streamlit as st
+ import pinecone
+ import openai
+ from openai.embeddings_utils import get_embedding
+ import json
+
+ OPENAI_KEY = st.secrets["OPENAI_KEY"]
+ PINECONE_KEY = st.secrets["PINECONE_KEY"]
+ INDEX = 'openai-ml-qa'
+ instructions = {
+     "conservative q&a": "Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext:\n{0}\n\n---\n\nQuestion: {1}\nAnswer:",
+     "paragraph about a question": "Write a paragraph, addressing the question, and use the text below to obtain relevant information\"\n\nContext:\n{0}\n\n---\n\nQuestion: {1}\nParagraph long Answer:",
+     "bullet points": "Write a bullet point list of possible answers, addressing the question, and use the text below to obtain relevant information\"\n\nContext:\n{0}\n\n---\n\nQuestion: {1}\nBullet point Answer:",
+     "summarize problems given a topic": "Write a summary of the problems addressed by the questions below\"\n\n{0}\n\n---\n\n",
+     "extract key libraries and tools": "Write a list of libraries and tools present in the context below\"\n\nContext:\n{0}\n\n---\n\n",
+     "simple instructions": "{1} given the common questions and answers below \n\n{0}\n\n---\n\n",
+     "summarize": "Write an elaborate, paragraph long summary about \"{1}\" given the questions and answers from a public forum on this topic\n\n{0}\n\n---\n\nSummary:",
+ }
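
Each template above is filled with Python's `str.format`: the retrieved context goes into `{0}` and the user's question into `{1}`. A minimal sketch of the resulting prompt, with hypothetical context and question values:

```python
# Hypothetical values for illustration; in the app, `context` is assembled
# from Pinecone matches by create_context() below.
context = "Topic: Using GradientTape\nAnswer: Wrap the forward pass in tf.GradientTape() ..."
question = "How do I use the gradient tape in tensorflow?"

# Fill the "conservative q&a" template exactly as answer_question() does.
prompt = instructions["conservative q&a"].format(context, question)
print(prompt)  # the final prompt string passed to openai.Completion.create
```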
+
+ @st.experimental_singleton(show_spinner=False)
+ def init_openai():
+     # initialize connection to OpenAI
+     openai.api_key = OPENAI_KEY
+
+ @st.experimental_singleton(show_spinner=False)
+ def init_pinecone(index_name):
+     # initialize connection to Pinecone vector DB (app.pinecone.io for API key)
+     pinecone.init(
+         api_key=PINECONE_KEY,
+         environment='us-west1-gcp'
+     )
+     index = pinecone.Index(index_name)
+     stats = index.describe_index_stats()
+     dims = stats['dimension']
+     count = stats['namespaces']['']['vector_count']
+     return index, dims, count
+
+ def create_context(question, index, lib_meta, max_len=3750, top_k=5):
+     """
+     Find most relevant context for a question via Pinecone search
+     """
+     q_embed = get_embedding(question, engine='text-embedding-ada-002')
+     res = index.query(
+         q_embed, top_k=top_k,
+         include_metadata=True, filter={
+             'docs': {'$in': lib_meta}
+         })
+
+
+     cur_len = 0
+     contexts = []
+     sources = []
+
+     for row in res['matches']:
+         meta = row['metadata']
+         text = (
+             f"Topic: {meta['thread']}\n" +
+             f"Answer: {meta['context']}"
+         )
+         cur_len += len(text)
+         if cur_len < max_len:
+             contexts.append(text)
+             sources.append(row['metadata'])
+         else:
+             cur_len -= len(text) + 4
+             if max_len - cur_len < 200:
+                 break
+     return "\n\n###\n\n".join(contexts), sources
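
`create_context` assumes each Pinecone match carries the metadata fields read above and in the Sources expanders further down. A sketch of the expected match shape, with hypothetical field values inferred from the code:

```python
# Shape of one entry in res['matches'], inferred from the fields the app reads.
example_match = {
    "id": "thread-123",  # hypothetical vector ID
    "score": 0.89,
    "metadata": {
        "docs": "tensorflow",                    # matched by the {'docs': {'$in': lib_meta}} filter
        "category": "general",                   # hypothetical forum category
        "thread": "Using GradientTape",          # hypothetical thread title
        "href": "https://discuss.example/123",   # hypothetical source URL
        "context": "Wrap the forward pass in tf.GradientTape() ...",
    },
}
```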
+
+ def answer_question(
+     index,
+     fine_tuned_qa_model="text-davinci-003",
+     question="Am I allowed to publish model outputs to Twitter, without a human review?",
+     instruction="Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext:\n{0}\n\n---\n\nQuestion: {1}\nAnswer:",
+     max_len=3550,
+     size="curie",
+     top_k=5,
+     debug=False,
+     max_tokens=400,
+     stop_sequence=None,
+     domains=["huggingface", "tensorflow", "streamlit", "pytorch"],
+ ):
+     """
+     Answer a question based on the most similar context retrieved from the index
+     """
+     context, sources = create_context(
+         question,
+         index,
+         lib_meta=domains,
+         max_len=max_len,
+         top_k=top_k
+     )
+     if debug:
+         print("Context:\n" + context)
+         print("\n\n")
+     try:
+         # fine-tuned models require the model parameter, whereas other models require the engine parameter
+         model_param = (
+             {"model": fine_tuned_qa_model}
+             if ":" in fine_tuned_qa_model
+             and fine_tuned_qa_model.split(":")[1].startswith("ft")
+             else {"engine": fine_tuned_qa_model}
+         )
+         # print(instruction.format(context, question))
+         response = openai.Completion.create(
+             prompt=instruction.format(context, question),
+             temperature=0,
+             max_tokens=max_tokens,
+             top_p=1,
+             frequency_penalty=0,
+             presence_penalty=0,
+             stop=stop_sequence,
+             **model_param,
+         )
+         return response["choices"][0]["text"].strip(), sources
+     except Exception as e:
+         print(e)
+         return "", []
+
+ def search(index, query, style, top_k, lib_filters):
+     if query != "":
+         with st.spinner("Retrieving, please wait..."):
+             answer, sources = answer_question(
+                 index,
+                 question=query,
+                 instruction=instructions[style.lower()],
+                 top_k=top_k
+             )
+         # lowercase relevant lib filters
+         lib_meta = [lib.lower() for lib in lib_filters.keys() if lib_filters[lib]]
+         lower_libs = [lib.lower() for lib in libraries]
+         # display the answer
+         st.write(answer)
+         with st.expander("Sources"):
+             for source in sources:
+                 st.write(f"""
+                 {source['docs']} > {source['category']} > [{source['thread']}]({source['href']})
+                 """)
+
+ st.markdown("""
+ <link
+   rel="stylesheet"
+   href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&display=swap"
+ />
+ """, unsafe_allow_html=True)
+
+ #model_name = 'mpnet-discourse'
+
+ libraries = [
+     "Streamlit",
+     "HuggingFace",
+     "PyTorch",
+     "TensorFlow"
+ ]
+
+ with st.spinner("Connecting to OpenAI..."):
+     retriever = init_openai()
+
+ with st.spinner("Connecting to Pinecone..."):
+     index, dims, count = init_pinecone(INDEX)
+
+ st.write("# ML Q&A")
+ search = st.container()
+ query = search.text_input('Ask a framework-specific question!', "")
+
+ with search.expander("Search Options"):
+     style = st.radio(label='Style', options=[
+         'Paragraph about a question', 'Conservative Q&A',
+         'Bullet points', 'Summarize problems given a topic',
+         'Extract key libraries and tools', 'Simple instructions',
+         'Summarize'
+     ])
+     # add section for filters
+     st.write("""
+     #### Metadata Filters
+
+     **Libraries**
+     """)
+     # create two cols
+     cols = st.columns(2)
+     # add filtering based on library
+     lib_filters = {}
+     for lib in libraries:
+         i = len(lib_filters.keys()) % 2
+         with cols[i]:
+             lib_filters[lib] = st.checkbox(lib, value=True)
+     st.write("---")
+     top_k = st.slider(
+         "top_k",
+         min_value=1,
+         max_value=20,
+         value=5
+     )
+
+ st.sidebar.write(f"""
+ ### Info
+
+ **Pinecone index name**: {INDEX}
+
+ **Pinecone index size**: {count}
+
+ **OpenAI embedding model**: *text-embedding-ada-002*
+
+ **Vector dimensionality**: {dims}
+
+ **OpenAI generation model**: *text-davinci-003*
+
+ ---
+
+ ### How it Works
+
+ The Q&A tool takes discussions and docs from some of the best Python ML
+ libraries and collates their content into a natural language search and Q&A experience.
+
+ Ask questions like **"How do I use the gradient tape in tensorflow?"** or **"What is the difference
+ between Tensorflow and PyTorch?"**, choose an answer style, and return relevant results!
+
+ The app is powered by OpenAI's embedding service and Pinecone's vector database. The whole process consists
+ of *three* steps:
+
+ **1**. Questions are fed into OpenAI's embeddings service to generate a {dims}-dimensional query vector.
+
+ **2**. We use Pinecone to identify similar context vectors (previously encoded from Q&A pages).
+
+ **3**. Relevant pages are passed alongside the question in a new prompt to OpenAI's generative model, returning our answer.
+
+ **How do I make something like this?**
+
+ It's easy! Check out the [source code](https://github.com/pinecone-io/examples/tree/master/integrations/openai/beyond_search_webinar) and learn how to [integrate OpenAI and Pinecone in the docs](https://www.pinecone.io/docs/integrations/openai/)!
+
+ ---
+
+ ### Usage
+
+ If you'd like to restrict your search to a specific library (such as PyTorch or
+ Streamlit) you can with the *Search Options* dropdown. The source of information
+ can be switched between official docs and forum discussions too!
+
+ If you'd like OpenAI to consider more or fewer pages, try changing the `top_k` slider.
+
+ Want to see the original sources that GPT-3 is using to generate the answer? No problem, just click on the **Sources** box.
+ """)
+
+ #if style.lower() == 'conservative q&a':
+ #    search.info("*Access search options above.*")
+
+ if search.button("Go!") or query != "":
+     with st.spinner("Retrieving, please wait..."):
+         # lowercase relevant lib filters
+         lib_meta = [lib.lower() for lib in lib_filters.keys() if lib_filters[lib]]
+         # ask the question
+         answer, sources = answer_question(
+             index,
+             question=query,
+             instruction=instructions[style.lower()],
+             top_k=top_k,
+             domains=lib_meta
+         )
+         # display the answer
+         st.write(answer)
+         with st.expander("Sources"):
+             for source in sources:
+                 st.write(f"""
+                 {source['docs']} > {source['category']} > [{source['thread']}]({source['href']})
+                 """)
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ transformers
+ sentence-transformers
+ pinecone-client
+ openai
+ plotly
+ matplotlib
+ click==8.0