mandrx Abhilashvj commited on
Commit
e7ebc48
0 Parent(s):

Duplicate from Everymans-ai/GPT-knowledge-management

Browse files

Co-authored-by: Abhilash V J <Abhilashvj@users.noreply.huggingface.co>

Files changed (9) hide show
  1. .gitattributes +34 -0
  2. .streamlit/secrets.toml +0 -0
  3. 1.5 +29 -0
  4. README.md +15 -0
  5. app.py +339 -0
  6. packages.txt +2 -0
  7. pinecorn.haystack-pipeline.yml.yml +55 -0
  8. requirements.txt +10 -0
  9. search.py +60 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.streamlit/secrets.toml ADDED
File without changes
1.5 ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Requirement already satisfied: tensorboard in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (2.11.0)
2
+ Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (0.4.6)
3
+ Requirement already satisfied: markdown>=2.6.8 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (3.4.1)
4
+ Requirement already satisfied: requests<3,>=2.21.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (2.28.1)
5
+ Requirement already satisfied: absl-py>=0.4 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (1.3.0)
6
+ Requirement already satisfied: setuptools>=41.0.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (65.4.1)
7
+ Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (0.6.1)
8
+ Requirement already satisfied: wheel>=0.26 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (0.37.1)
9
+ Requirement already satisfied: google-auth<3,>=1.6.3 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (2.12.0)
10
+ Requirement already satisfied: protobuf<4,>=3.9.2 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (3.19.4)
11
+ Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (1.8.1)
12
+ Requirement already satisfied: grpcio>=1.24.3 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (1.49.1)
13
+ Requirement already satisfied: numpy>=1.12.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (1.21.6)
14
+ Requirement already satisfied: werkzeug>=1.0.1 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (2.2.2)
15
+ Requirement already satisfied: six>=1.9.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard) (1.16.0)
16
+ Requirement already satisfied: cachetools<6.0,>=2.0.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard) (5.2.0)
17
+ Requirement already satisfied: rsa<5,>=3.1.4 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard) (4.9)
18
+ Requirement already satisfied: pyasn1-modules>=0.2.1 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard) (0.2.8)
19
+ Requirement already satisfied: requests-oauthlib>=0.7.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard) (1.3.1)
20
+ Requirement already satisfied: importlib-metadata>=4.4 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from markdown>=2.6.8->tensorboard) (5.0.0)
21
+ Requirement already satisfied: urllib3<1.27,>=1.21.1 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard) (1.26.12)
22
+ Requirement already satisfied: charset-normalizer<3,>=2 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard) (2.1.1)
23
+ Requirement already satisfied: idna<4,>=2.5 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard) (3.4)
24
+ Requirement already satisfied: certifi>=2017.4.17 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard) (2022.12.7)
25
+ Requirement already satisfied: MarkupSafe>=2.1.1 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from werkzeug>=1.0.1->tensorboard) (2.1.1)
26
+ Requirement already satisfied: zipp>=0.5 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard) (3.11.0)
27
+ Requirement already satisfied: typing-extensions>=3.6.4 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard) (4.4.0)
28
+ Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard) (0.4.8)
29
+ Requirement already satisfied: oauthlib>=3.0.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard) (3.2.1)
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Haystack QA
3
+ emoji: 📚
4
+ colorFrom: yellow
5
+ colorTo: green
6
+ sdk: streamlit
7
+ sdk_version: 1.15.2
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: Everymans-ai/GPT-knowledge-management
12
+ ---
13
+
14
+
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import json
3
+ import logging
4
+ import os
5
+ import shutil
6
+ import sys
7
+ import uuid
8
+ from json import JSONDecodeError
9
+ from pathlib import Path
10
+ from time import sleep
11
+
12
+ import openai
13
+ import pandas as pd
14
+ import pinecone
15
+ import streamlit as st
16
+ from annotated_text import annotation
17
+ from haystack import Document
18
+ from haystack.document_stores import PineconeDocumentStore
19
+ from haystack.nodes import (
20
+ DocxToTextConverter,
21
+ EmbeddingRetriever,
22
+ FARMReader,
23
+ FileTypeClassifier,
24
+ PDFToTextConverter,
25
+ PreProcessor,
26
+ TextConverter,
27
+ )
28
+ from haystack.pipelines import ExtractiveQAPipeline, Pipeline
29
+ from markdown import markdown
30
+ from sentence_transformers import SentenceTransformer
31
+ from tqdm.auto import tqdm
32
+
33
+ # get API key from top-right dropdown on OpenAI website
34
+ openai.api_key = st.secrets["OPENAI_API_KEY"]
35
+ index_name = "openai-ada-002-index"
36
+
37
+
38
+ # connect to pinecone environment
39
+ pinecone.init(api_key=st.secrets["pinecone_apikey"], environment="us-east1-gcp")
40
+
41
+ embed_model = "text-embedding-ada-002"
42
+ preprocessor = PreProcessor(
43
+ clean_empty_lines=True,
44
+ clean_whitespace=True,
45
+ clean_header_footer=False,
46
+ split_by="word",
47
+ split_length=200,
48
+ split_respect_sentence_boundary=True,
49
+ )
50
+ file_type_classifier = FileTypeClassifier()
51
+ text_converter = TextConverter()
52
+ pdf_converter = PDFToTextConverter()
53
+ docx_converter = DocxToTextConverter()
54
+
55
+ # check if the index named by index_name exists
56
+ if index_name not in pinecone.list_indexes():
57
+ # delete the current index and create the new index if it does not exist
58
+ for delete_index in pinecone.list_indexes():
59
+ pinecone.delete_index(delete_index)
60
+ pinecone.create_index(index_name, dimension=1536, metric="cosine")
61
+
62
+ # connect to the index named by index_name
63
+ index = pinecone.Index(index_name)
64
+
65
+ FILE_UPLOAD_PATH = "./data/uploads/"
66
+ os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)
67
+
68
+ limit = 3750
69
+
70
+
71
def retrieve(query):
    """Build a context-augmented completion prompt for ``query``.

    The query is embedded with the OpenAI model named by ``embed_model``, the
    three closest matches are fetched from the Pinecone ``index``, and their
    ``text`` metadata is packed into a prompt placed ahead of the question.

    Contexts are added in ranking order for as long as the joined context text
    stays shorter than ``limit`` characters, so the prompt may contain none,
    some, or all of the retrieved contexts. A prompt is always produced, even
    when Pinecone returns no matches.

    Args:
        query: The user's question as plain text.

    Returns:
        A ``(prompt, contexts)`` tuple: the prompt string for the completion
        model, and the list of every retrieved context string (including any
        that did not fit into the prompt).
    """
    separator = "\n\n---\n\n"

    # embed the question
    res = openai.Embedding.create(input=[query], engine=embed_model)
    xq = res["data"][0]["embedding"]

    # get relevant contexts; a match without "text" metadata contributes ""
    res = index.query(xq, top_k=3, include_metadata=True)
    contexts = [x["metadata"].get("text", "") for x in res["matches"]]

    # build our prompt with the retrieved contexts included
    prompt_start = "Answer the question based on the context below.\n\n" + "Context:\n"
    prompt_end = f"\n\nQuestion: {query}\nAnswer:"

    # Keep the longest prefix of contexts whose joined length is below the
    # limit. Unlike the previous index-based loop this also handles zero or
    # one context and checks the complete set against the limit.
    selected = []
    for context in contexts:
        if len(separator.join(selected + [context])) >= limit:
            break
        selected.append(context)

    prompt = prompt_start + separator.join(selected) + prompt_end
    return prompt, contexts
92
+
93
+
94
+ # first let's make it simpler to get answers
95
def complete(prompt):
    """Return the completion text that ``text-davinci-003`` produces for ``prompt``.

    The request uses temperature 0, at most 400 generated tokens and no stop
    sequence. Only the first choice of the response is used, with surrounding
    whitespace stripped.

    Args:
        prompt: The full prompt string to send to the completion engine.

    Returns:
        The stripped text of the first returned choice.
    """
    request_params = {
        "engine": "text-davinci-003",
        "prompt": prompt,
        "temperature": 0,
        "max_tokens": 400,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    response = openai.Completion.create(**request_params)
    first_choice = response["choices"][0]
    return first_choice["text"].strip()
108
+
109
+
110
def query(question, top_k_reader, top_k_retriever):
    """Answer ``question`` from the indexed documents.

    Builds a context-augmented prompt with ``retrieve`` and passes it to
    ``complete``.

    Args:
        question: The user's question as plain text.
        top_k_reader: Accepted but not used by this implementation.
        top_k_retriever: Accepted but not used by this implementation.

    Returns:
        An ``(answer, contexts)`` tuple: the generated answer text and the
        list of context strings returned by ``retrieve``.
    """
    prompt, contexts = retrieve(question)
    answer = complete(prompt)
    return answer, contexts
114
+
115
+
116
+ indexing_pipeline_with_classification = Pipeline()
117
+ indexing_pipeline_with_classification.add_node(
118
+ component=file_type_classifier, name="FileTypeClassifier", inputs=["File"]
119
+ )
120
+ indexing_pipeline_with_classification.add_node(
121
+ component=text_converter, name="TextConverter", inputs=["FileTypeClassifier.output_1"]
122
+ )
123
+ indexing_pipeline_with_classification.add_node(
124
+ component=pdf_converter, name="PdfConverter", inputs=["FileTypeClassifier.output_2"]
125
+ )
126
+ indexing_pipeline_with_classification.add_node(
127
+ component=docx_converter, name="DocxConverter", inputs=["FileTypeClassifier.output_4"]
128
+ )
129
+ indexing_pipeline_with_classification.add_node(
130
+ component=preprocessor,
131
+ name="Preprocessor",
132
+ inputs=["TextConverter", "PdfConverter", "DocxConverter"],
133
+ )
134
+
135
+
136
def set_state_if_absent(key, value):
    """Store ``value`` under ``key`` in ``st.session_state`` unless the key already exists."""
    if key in st.session_state:
        return
    st.session_state[key] = value
139
+
140
+
141
+ # Adjust to a question that you would like users to see in the search bar when they load the UI:
142
+ DEFAULT_QUESTION_AT_STARTUP = os.getenv(
143
+ "DEFAULT_QUESTION_AT_STARTUP", "My blog post discusses remote work. Give me statistics."
144
+ )
145
+ DEFAULT_ANSWER_AT_STARTUP = os.getenv(
146
+ "DEFAULT_ANSWER_AT_STARTUP",
147
+ "7% more remote workers have been at their current organization for 5 years or fewer",
148
+ )
149
+
150
+ # Sliders
151
+ DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
152
+ DEFAULT_NUMBER_OF_ANSWERS = int(os.getenv("DEFAULT_NUMBER_OF_ANSWERS", "3"))
153
+
154
+
155
+ st.set_page_config(
156
+ page_title="GPT3 and Langchain Demo"
157
+ )
158
+
159
+ # Persistent state
160
+ set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP)
161
+ set_state_if_absent("answer", DEFAULT_ANSWER_AT_STARTUP)
162
+ set_state_if_absent("results", None)
163
+
164
+
165
+ # Small callback to reset the interface in case the text of the question changes
166
def reset_results(*args):
    """Clear the answer, results and raw JSON held in ``st.session_state``.

    Positional arguments are accepted and ignored so the function can be
    passed directly as a widget ``on_change`` callback.
    """
    for field in ("answer", "results", "raw_json"):
        setattr(st.session_state, field, None)
170
+
171
+
172
+ # Title
173
+ st.write("# GPT3 and Langchain Demo")
174
+ st.markdown(
175
+ """
176
+ This demo takes its data from the documents uploaded to the Pinecone index through this app. \n
177
+ Ask any question from the uploaded documents and Pinecone will retrieve the context for answers and GPT3 will answer them using the retrieved context. \n
178
+ *Note: do not use keywords, but full-fledged questions.* The demo is not optimized to deal with keyword queries and might misunderstand you.
179
+ """,
180
+ unsafe_allow_html=True,
181
+ )
182
+
183
+ # Sidebar
184
+ st.sidebar.header("Options")
185
+ st.sidebar.write("## File Upload:")
186
+ data_files = st.sidebar.file_uploader(
187
+ "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
188
+ )
189
+ ALL_FILES = []
190
+ META_DATA = []
191
+ for data_file in data_files:
192
+ # Upload file
193
+ if data_file:
194
+ file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{data_file.name}"
195
+ with open(file_path, "wb") as f:
196
+ f.write(data_file.getbuffer())
197
+ ALL_FILES.append(file_path)
198
+ st.sidebar.write(str(data_file.name) + " &nbsp;&nbsp; ✅ ")
199
+ META_DATA.append({"filename": data_file.name})
200
+
201
+
202
if ALL_FILES:
    # Convert, clean and split the uploaded files into Haystack documents.
    docs = indexing_pipeline_with_classification.run(file_paths=ALL_FILES, meta=META_DATA)[
        "documents"
    ]
    # Number of documents embedded and upserted per request.
    batch_size = 100
    # One initial embedding call plus up to five retries per batch.
    max_attempts = 6
    with st.spinner("🧠 &nbsp;&nbsp; Performing indexing of uplaoded documents... \n "):
        for i in range(0, len(docs), batch_size):
            # extract batch
            batch_docs = docs[i : i + batch_size]
            batch = [doc.content for doc in batch_docs]

            # generate embeddings for batch, waiting 5 seconds before each retry
            res = None
            first_error = None
            for attempt in range(max_attempts):
                if attempt:
                    sleep(5)
                try:
                    res = openai.Embedding.create(input=batch, engine=embed_model)
                    break
                except Exception as e:  # boundary with a remote service: report, don't crash the UI
                    if first_error is None:
                        first_error = e

            if res is None:
                # Every attempt failed: report the original error and skip this batch.
                st.error(f"🐞 File indexing failed{str(first_error)}")
                continue

            embeds = [record["embedding"] for record in res["data"]]
            # Store the chunk text next to the document metadata. A copy is
            # used so the Document objects themselves are not modified.
            meta = [{**doc.meta, "text": doc.content} for doc in batch_docs]
            # reuse the document IDs as vector IDs
            ids = [doc.id for doc in batch_docs]
            # upsert/insert these records to pinecone
            to_upsert = list(zip(ids, embeds, meta))
            _ = index.upsert(vectors=to_upsert)
250
+
251
+ # top_k_reader = st.sidebar.slider(
252
+ # "Max. number of answers",
253
+ # min_value=1,
254
+ # max_value=10,
255
+ # value=DEFAULT_NUMBER_OF_ANSWERS,
256
+ # step=1,
257
+ # on_change=reset_results,
258
+ # )
259
+ # top_k_retriever = st.sidebar.slider(
260
+ # "Max. number of documents from retriever",
261
+ # min_value=1,
262
+ # max_value=10,
263
+ # value=DEFAULT_DOCS_FROM_RETRIEVER,
264
+ # step=1,
265
+ # on_change=reset_results,
266
+ # )
267
+ # data_files = st.file_uploader(
268
+ # "upload", type=["csv"], accept_multiple_files=True, label_visibility="hidden"
269
+ # )
270
+ # for data_file in data_files:
271
+ # # Upload file
272
+ # if data_file:
273
+ # raw_json = upload_doc(data_file)
274
+
275
+ question = st.text_input(
276
+ value=st.session_state.question,
277
+ max_chars=100,
278
+ on_change=reset_results,
279
+ label="question",
280
+ label_visibility="hidden",
281
+ )
282
+ col1, col2 = st.columns(2)
283
+ col1.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
284
+ col2.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
285
+
286
+ # Run button
287
run_pressed = col1.button("Run")
if run_pressed:
    # Inside this branch run_pressed is True, so run_query is True as well.
    run_query = run_pressed or question != st.session_state.question
    # Get results for query
    if question and run_query:
        reset_results()
        st.session_state.question = question

        with st.spinner("🧠 &nbsp;&nbsp; Performing neural search on documents... \n "):
            try:
                st.session_state.results = query(question, top_k_reader=None, top_k_retriever=None)
            except JSONDecodeError:
                st.error(
                    "👓 &nbsp;&nbsp; An error occurred reading the results. Is the document store working?"
                )
            except Exception as e:
                logging.exception(e)
                error_text = str(e)
                server_busy = (
                    "The server is busy processing requests" in error_text or "503" in error_text
                )
                if server_busy:
                    st.error("🧑‍🌾 &nbsp;&nbsp; All our workers are busy! Try again later.")
                else:
                    st.error(f"🐞 &nbsp;&nbsp; An error occurred during the request. {error_text}")
309
+
310
+
311
if st.session_state.results:

    st.write("## Results:")

    # The stored results are the (answer text, retrieved contexts) pair
    # returned by query(); only the answer is rendered.
    result, contexts = st.session_state.results
    # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
    try:
        st.write(
            markdown(f"Answer: \n {result} \n"),
            unsafe_allow_html=True,
        )
    except Exception:
        # Log the failure instead of hiding it, then fall back to the
        # compact single-line rendering.
        logging.exception("Rendering the formatted answer failed")
        st.write(
            markdown(f"Answer: {result}"),
            unsafe_allow_html=True,
        )
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ poppler-utils
2
+ xpdf
pinecorn.haystack-pipeline.yml.yml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To allow your IDE to autocomplete and validate your YAML pipelines, name them as <name of your choice>.haystack-pipeline.yml
2
+
3
+ version: ignore
4
+
5
+ components: # define all the building-blocks for Pipeline
6
- name: DocumentStore
  type: ElasticsearchDocumentStore
  params:
    # The parameter is named "index"; the previous key "index=" was a typo.
    index: qa_demo
    similarity: cosine
    embedding_dim: 768
12
+ - name: Retriever
13
+ type: BM25Retriever
14
+ params:
15
+ document_store: DocumentStore # params can reference other components defined in the YAML
16
+ top_k: 5
17
+ - name: Reader # custom-name for the component; helpful for visualization & debugging
18
+ type: FARMReader # Haystack Class name for the component
19
+ params:
20
+ model_name_or_path: deepset/roberta-base-squad2
21
+ context_window_size: 500
22
+ return_no_answer: true
23
+ - name: TextFileConverter
24
+ type: TextConverter
25
+ - name: PDFFileConverter
26
+ type: PDFToTextConverter
27
+ - name: Preprocessor
28
+ type: PreProcessor
29
+ params:
30
+ split_by: word
31
+ split_length: 1000
32
+ - name: FileTypeClassifier
33
+ type: FileTypeClassifier
34
+
35
+ pipelines:
36
+ - name: query # a sample extractive-qa Pipeline
37
+ nodes:
38
+ - name: Retriever
39
+ inputs: [Query]
40
+ - name: Reader
41
+ inputs: [Retriever]
42
+ - name: indexing
43
+ nodes:
44
+ - name: FileTypeClassifier
45
+ inputs: [File]
46
+ - name: TextFileConverter
47
+ inputs: [FileTypeClassifier.output_1]
48
+ - name: PDFFileConverter
49
+ inputs: [FileTypeClassifier.output_2]
50
+ - name: Preprocessor
51
+ inputs: [PDFFileConverter, TextFileConverter]
52
+ - name: Retriever
53
+ inputs: [Preprocessor]
54
+ - name: DocumentStore
55
+ inputs: [Retriever]
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ protobuf==3.19
2
+ streamlit==1.13
3
+ st-annotated-text
4
+ farm-haystack[pinecone]
5
+ farm-haystack[ocr]
6
+ pinecone-client
7
+ datasets
8
+ tensorboard
9
+ openai
10
+ langchain
search.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import pinecone
4
+ index_name = "abstractive-question-answering"
5
+
6
+ # check if the abstractive-question-answering index exists
7
+ if index_name not in pinecone.list_indexes():
8
+ # create the index if it does not exist
9
+ pinecone.create_index(
10
+ index_name,
11
+ dimension=768,
12
+ metric="cosine"
13
+ )
14
+
15
+ # connect to abstractive-question-answering index we created
16
+ index = pinecone.Index(index_name)
17
+
18
+ # we will use batches of 64
19
+ batch_size = 64
20
+
21
+ for i in tqdm(range(0, len(df), batch_size)):
22
+ # find end of batch
23
+ i_end = min(i+batch_size, len(df))
24
+ # extract batch
25
+ batch = df.iloc[i:i_end]
26
+ # generate embeddings for batch
27
+ emb = retriever.encode(batch["passage_text"].tolist()).tolist()
28
+ # get metadata
29
+ meta = batch.to_dict(orient="records")
30
+ # create unique IDs
31
+ ids = [f"{idx}" for idx in range(i, i_end)]
32
+ # add all to upsert list
33
+ to_upsert = list(zip(ids, emb, meta))
34
+ # upsert/insert these records to pinecone
35
+ _ = index.upsert(vectors=to_upsert)
36
+
37
+ # check that we have all vectors in index
38
+ index.describe_index_stats()
39
+
40
+ # from transformers import BartTokenizer, BartForConditionalGeneration
41
+
42
+ # # load bart tokenizer and model from huggingface
43
+ # tokenizer = BartTokenizer.from_pretrained('vblagoje/bart_lfqa')
44
+ # generator = BartForConditionalGeneration.from_pretrained('vblagoje/bart_lfqa')
45
+
46
+ # def query_pinecone(query, top_k):
47
+ # # generate embeddings for the query
48
+ # xq = retriever.encode([query]).tolist()
49
+ # # search pinecone index for context passage with the answer
50
+ # xc = index.query(xq, top_k=top_k, include_metadata=True)
51
+ # return xc
52
+
53
+ # def format_query(query, context):
54
+ # # extract passage_text from Pinecone search result and add the tag
55
+ # context = [f" {m['metadata']['passage_text']}" for m in context]
56
+ # # concatenate all context passages
57
+ # context = " ".join(context)
58
+ # # concatenate the query and context passages
59
+ # query = f"question: {query} context: {context}"
60
+ # return query