Abhilashvj committed on
Commit
2b9b8a4
1 Parent(s): fabd612

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -45
app.py CHANGED
@@ -26,6 +26,10 @@ from haystack.pipelines import ExtractiveQAPipeline, Pipeline
26
  from markdown import markdown
27
  from sentence_transformers import SentenceTransformer
28
 
 
 
 
 
29
  index_name = "qa_demo"
30
 
31
 
@@ -36,6 +40,7 @@ pinecone.init(
36
  )
37
  index_name = "qa-demo"
38
 
 
39
  preprocessor = PreProcessor(
40
  clean_empty_lines=True,
41
  clean_whitespace=True,
@@ -54,7 +59,7 @@ if index_name not in pinecone.list_indexes():
54
  # create the index if it does not exist
55
  pinecone.create_index(
56
  index_name,
57
- dimension=768,
58
  metric="cosine"
59
  )
60
 
@@ -69,7 +74,10 @@ def create_doc_store():
69
  api_key= st.secrets["pinecone_apikey"],
70
  index=index_name,
71
  similarity="cosine",
72
- embedding_dim=768
 
 
 
73
  )
74
  return document_store
75
 
@@ -83,20 +91,68 @@ def create_doc_store():
83
  # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
84
  # pipe = ExtractiveQAPipeline(reader, retriever)
85
  # return pipe
 
86
 
87
- def query(pipe, question, top_k_reader, top_k_retriever):
88
- res = pipe.run(
89
- query=question, params={"Retriever": {"top_k": top_k_retriever}, "Reader": {"top_k": top_k_reader}}
 
90
  )
91
- answer_df = []
92
- # for r in res['answers']:
93
- # ans_dict = res['answers'][0].meta
94
- # ans_dict["answer"] = r.context
95
- # answer_df.append(ans_dict)
96
- # result = pd.DataFrame(answer_df)
97
- # result.columns = ["Source","Title","Year","Link","Answer"]
98
- # result[["Answer","Link","Source","Title","Year"]]
99
- return res
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  document_store = create_doc_store()
102
  # pipe = create_pipe(document_store)
@@ -107,11 +163,11 @@ embedding_model=retriever_model,
107
  model_format="sentence_transformers",
108
  )
109
  # load the retriever model from huggingface model hub
110
- sentence_encoder = SentenceTransformer(retriever_model)
111
-
112
- reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
113
- pipe = ExtractiveQAPipeline(reader, retriever)
114
 
 
 
 
115
 
116
  indexing_pipeline_with_classification = Pipeline()
117
  indexing_pipeline_with_classification.add_node(
@@ -205,7 +261,18 @@ if len(ALL_FILES) > 0:
205
  # extract batch
206
  batch = [doc.content for doc in docs[i:i_end]]
207
  # generate embeddings for batch
208
- emb = sentence_encoder.encode(batch).tolist()
 
 
 
 
 
 
 
 
 
 
 
209
  # get metadata
210
  meta = [doc.meta for doc in docs[i:i_end]]
211
  # create unique IDs
@@ -215,22 +282,22 @@ if len(ALL_FILES) > 0:
215
  # upsert/insert these records to pinecone
216
  _ = index.upsert(vectors=to_upsert)
217
 
218
- top_k_reader = st.sidebar.slider(
219
- "Max. number of answers",
220
- min_value=1,
221
- max_value=10,
222
- value=DEFAULT_NUMBER_OF_ANSWERS,
223
- step=1,
224
- on_change=reset_results,
225
- )
226
- top_k_retriever = st.sidebar.slider(
227
- "Max. number of documents from retriever",
228
- min_value=1,
229
- max_value=10,
230
- value=DEFAULT_DOCS_FROM_RETRIEVER,
231
- step=1,
232
- on_change=reset_results,
233
- )
234
  # data_files = st.file_uploader(
235
  # "upload", type=["csv"], accept_multiple_files=True, label_visibility="hidden"
236
  # )
@@ -267,7 +334,7 @@ if run_pressed:
267
  ):
268
  try:
269
  st.session_state.results = query(
270
- pipe, question, top_k_reader=top_k_reader, top_k_retriever=top_k_retriever
271
  )
272
  except JSONDecodeError as je:
273
  st.error("👓    An error occurred reading the results. Is the document store working?")
@@ -283,21 +350,29 @@ if st.session_state.results:
283
 
284
  st.write("## Results:")
285
 
286
- for count, result in enumerate(st.session_state.results['answers']):
287
- answer, context = result.answer, result.context
288
- start_idx = context.find(answer)
289
- end_idx = start_idx + len(answer)
290
  # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
291
  try:
292
- source = f"[{result.meta['Title']}]({result.meta['link']})"
 
 
 
 
293
  st.write(
294
- markdown(f'**Source:** {source} \n {context[:start_idx] } {str(annotation(answer, "ANSWER", "#8ef"))} {context[end_idx:]} \n '),
295
  unsafe_allow_html=True,
296
- )
297
  except:
298
- filename = result.meta.get('filename', "")
 
 
 
 
299
  st.write(
300
- markdown(f'From file: {filename} \n {context[:start_idx] } {str(annotation(answer, "ANSWER", "#8ef"))} {context[end_idx:]} \n '),
301
  unsafe_allow_html=True,
302
  )
303
 
 
26
  from markdown import markdown
27
  from sentence_transformers import SentenceTransformer
28
 
29
+ import openai
30
+
31
+ # get API key from top-right dropdown on OpenAI website
32
+ openai.api_key = st.secrets["OPENAI_API_KEY"]
33
  index_name = "qa_demo"
34
 
35
 
 
40
  )
41
  index_name = "qa-demo"
42
 
43
+ embed_model = "text-embedding-ada-002"
44
  preprocessor = PreProcessor(
45
  clean_empty_lines=True,
46
  clean_whitespace=True,
 
59
  # create the index if it does not exist
60
  pinecone.create_index(
61
  index_name,
62
+ dimension=1536,
63
  metric="cosine"
64
  )
65
 
 
74
  api_key= st.secrets["pinecone_apikey"],
75
  index=index_name,
76
  similarity="cosine",
77
+ embedding_dim=768,
78
+ metadata_config={
79
+ 'indexed': ['filename']
80
+ }
81
  )
82
  return document_store
83
 
 
91
  # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
92
  # pipe = ExtractiveQAPipeline(reader, retriever)
93
  # return pipe
94
+ limit = 3750
95
 
96
def retrieve(query):
    """Build a context-augmented prompt for ``query``.

    The query is embedded with the OpenAI model named by the module-level
    ``embed_model``; the three nearest matches are then fetched from the
    module-level Pinecone ``index`` and their ``metadata['text']`` values are
    packed into the prompt.

    Args:
        query: The user's question as a string.

    Returns:
        A ``(prompt, contexts)`` tuple. ``prompt`` is the text to send to the
        completion model. ``contexts`` is the list of *all* retrieved context
        strings, including any left out of the prompt because of ``limit``.
    """
    res = openai.Embedding.create(
        input=[query],
        engine=embed_model
    )

    # retrieve from Pinecone
    xq = res['data'][0]['embedding']

    # get relevant contexts
    res = index.query(xq, top_k=3, include_metadata=True)
    contexts = [x['metadata']['text'] for x in res['matches']]

    # Keep the longest prefix of contexts whose joined length (in characters)
    # stays strictly below the module-level ``limit``. Unlike an index-based
    # scan, this always assigns a prompt -- even for zero or one match -- and
    # also checks the final context against the budget.
    separator = "\n\n---\n\n"
    selected = []
    used = 0
    for context in contexts:
        cost = len(context) + (len(separator) if selected else 0)
        if used + cost >= limit:
            break
        selected.append(context)
        used += cost

    # build our prompt with the retrieved contexts included
    prompt = (
        "Answer the question based on the context below.\n\n"
        "Context:\n"
        + separator.join(selected)
        + f"\n\nQuestion: {query}\nAnswer:"
    )
    return prompt, contexts
135
+
136
+
137
+ # first let's make it simpler to get answers
138
def complete(prompt):
    """Send ``prompt`` to OpenAI's ``text-davinci-003`` and return its answer.

    Args:
        prompt: The full prompt text to complete.

    Returns:
        The text of the first returned choice, with surrounding whitespace
        stripped.
    """
    # Deterministic sampling settings: temperature 0, no penalties.
    request = {
        "engine": 'text-davinci-003',
        "prompt": prompt,
        "temperature": 0,
        "max_tokens": 400,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    response = openai.Completion.create(**request)
    first_choice = response['choices'][0]
    return first_choice['text'].strip()
151
+
152
def query(pipe, question, top_k_reader, top_k_retriever):
    """Answer ``question`` using retrieved contexts and a completion model.

    Args:
        pipe: Unused; kept so existing call sites keep working.
        question: The user's question.
        top_k_reader: Unused; kept so existing call sites keep working.
        top_k_retriever: Unused; kept so existing call sites keep working.

    Returns:
        A ``(answer, contexts)`` tuple: the completion text and the list of
        context strings returned by ``retrieve``.
    """
    # Fetch the relevant passages and the prompt built around them.
    prompt, contexts = retrieve(question)
    answer = complete(prompt)
    return answer, contexts
156
 
157
  document_store = create_doc_store()
158
  # pipe = create_pipe(document_store)
 
163
  model_format="sentence_transformers",
164
  )
165
  # load the retriever model from huggingface model hub
166
+ # sentence_encoder = SentenceTransformer(retriever_model)
 
 
 
167
 
168
+ # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
169
+ # pipe = ExtractiveQAPipeline(reader, retriever)
170
+ # now query text-davinci-003 WITHOUT context
171
 
172
  indexing_pipeline_with_classification = Pipeline()
173
  indexing_pipeline_with_classification.add_node(
 
261
  # extract batch
262
  batch = [doc.content for doc in docs[i:i_end]]
263
  # generate embeddings for batch
264
+ try:
265
+ res = openai.Embedding.create(input=texts, engine=embed_model)
266
+ except:
267
+ done = False
268
+ while not done:
269
+ sleep(5)
270
+ try:
271
+ res = openai.Embedding.create(input=texts, engine=embed_model)
272
+ done = True
273
+ except:
274
+ pass
275
+ embeds = [record['embedding'] for record in res['data']]
276
  # get metadata
277
  meta = [doc.meta for doc in docs[i:i_end]]
278
  # create unique IDs
 
282
  # upsert/insert these records to pinecone
283
  _ = index.upsert(vectors=to_upsert)
284
 
285
+ # top_k_reader = st.sidebar.slider(
286
+ # "Max. number of answers",
287
+ # min_value=1,
288
+ # max_value=10,
289
+ # value=DEFAULT_NUMBER_OF_ANSWERS,
290
+ # step=1,
291
+ # on_change=reset_results,
292
+ # )
293
+ # top_k_retriever = st.sidebar.slider(
294
+ # "Max. number of documents from retriever",
295
+ # min_value=1,
296
+ # max_value=10,
297
+ # value=DEFAULT_DOCS_FROM_RETRIEVER,
298
+ # step=1,
299
+ # on_change=reset_results,
300
+ # )
301
  # data_files = st.file_uploader(
302
  # "upload", type=["csv"], accept_multiple_files=True, label_visibility="hidden"
303
  # )
 
334
  ):
335
  try:
336
  st.session_state.results = query(
337
+ pipe, question, top_k_reader=None, top_k_retriever=None
338
  )
339
  except JSONDecodeError as je:
340
  st.error("👓    An error occurred reading the results. Is the document store working?")
 
350
 
351
  st.write("## Results:")
352
 
353
+ for result,contexts in st.session_state.results:
354
+ # answer, context = result.answer, result.context
355
+ # start_idx = context.find(answer)
356
+ # end_idx = start_idx + len(answer)
357
  # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
358
  try:
359
+ # source = f"[{result.meta['Title']}]({result.meta['link']})"
360
+ # st.write(
361
+ # markdown(f'**Source:** {source} \n {context[:start_idx] } {str(annotation(answer, "ANSWER", "#8ef"))} {context[end_idx:]} \n '),
362
+ # unsafe_allow_html=True,
363
+ # )
364
  st.write(
365
+ markdown(f"Answer: {result} \n Extracted from context {contexts}"),
366
  unsafe_allow_html=True,
367
+ )
368
  except:
369
+ # filename = result.meta.get('filename', "")
370
+ # st.write(
371
+ # markdown(f'From file: {filename} \n {context[:start_idx] } {str(annotation(answer, "ANSWER", "#8ef"))} {context[end_idx:]} \n '),
372
+ # unsafe_allow_html=True,
373
+ # )
374
  st.write(
375
+ markdown(f"Answer: {result}"),
376
  unsafe_allow_html=True,
377
  )
378