Abhilashvj commited on
Commit
ab8dd8d
0 Parent(s):

Duplicate from Abhilashvj/haystack_QA

Browse files
Files changed (9) hide show
  1. .gitattributes +34 -0
  2. .streamlit/secrets.toml +0 -0
  3. 1.5 +29 -0
  4. README.md +15 -0
  5. app.py +305 -0
  6. packages.txt +2 -0
  7. pinecorn.haystack-pipeline.yml.yml +55 -0
  8. requirements.txt +8 -0
  9. search.py +60 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.streamlit/secrets.toml ADDED
File without changes
1.5 ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Requirement already satisfied: tensorboard in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (2.11.0)
2
+ Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (0.4.6)
3
+ Requirement already satisfied: markdown>=2.6.8 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (3.4.1)
4
+ Requirement already satisfied: requests<3,>=2.21.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (2.28.1)
5
+ Requirement already satisfied: absl-py>=0.4 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (1.3.0)
6
+ Requirement already satisfied: setuptools>=41.0.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (65.4.1)
7
+ Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (0.6.1)
8
+ Requirement already satisfied: wheel>=0.26 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (0.37.1)
9
+ Requirement already satisfied: google-auth<3,>=1.6.3 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (2.12.0)
10
+ Requirement already satisfied: protobuf<4,>=3.9.2 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (3.19.4)
11
+ Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (1.8.1)
12
+ Requirement already satisfied: grpcio>=1.24.3 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (1.49.1)
13
+ Requirement already satisfied: numpy>=1.12.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (1.21.6)
14
+ Requirement already satisfied: werkzeug>=1.0.1 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from tensorboard) (2.2.2)
15
+ Requirement already satisfied: six>=1.9.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard) (1.16.0)
16
+ Requirement already satisfied: cachetools<6.0,>=2.0.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard) (5.2.0)
17
+ Requirement already satisfied: rsa<5,>=3.1.4 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard) (4.9)
18
+ Requirement already satisfied: pyasn1-modules>=0.2.1 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from google-auth<3,>=1.6.3->tensorboard) (0.2.8)
19
+ Requirement already satisfied: requests-oauthlib>=0.7.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard) (1.3.1)
20
+ Requirement already satisfied: importlib-metadata>=4.4 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from markdown>=2.6.8->tensorboard) (5.0.0)
21
+ Requirement already satisfied: urllib3<1.27,>=1.21.1 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard) (1.26.12)
22
+ Requirement already satisfied: charset-normalizer<3,>=2 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard) (2.1.1)
23
+ Requirement already satisfied: idna<4,>=2.5 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard) (3.4)
24
+ Requirement already satisfied: certifi>=2017.4.17 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from requests<3,>=2.21.0->tensorboard) (2022.12.7)
25
+ Requirement already satisfied: MarkupSafe>=2.1.1 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from werkzeug>=1.0.1->tensorboard) (2.1.1)
26
+ Requirement already satisfied: zipp>=0.5 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard) (3.11.0)
27
+ Requirement already satisfied: typing-extensions>=3.6.4 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard) (4.4.0)
28
+ Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard) (0.4.8)
29
+ Requirement already satisfied: oauthlib>=3.0.0 in /mnt/e/kaggle2022/conda/envs/pt/lib/python3.7/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard) (3.2.1)
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Haystack QA
3
+ emoji: 📚
4
+ colorFrom: yellow
5
+ colorTo: green
6
+ sdk: streamlit
7
+ sdk_version: 1.15.2
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: Abhilashvj/haystack_QA
12
+ ---
13
+
14
+
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import shutil
5
+ import sys
6
+ import uuid
7
+ from json import JSONDecodeError
8
+ from pathlib import Path
9
+
10
+ import pandas as pd
11
+ import pinecone
12
+ import streamlit as st
13
+ from annotated_text import annotation
14
+ from haystack import Document
15
+ from haystack.document_stores import PineconeDocumentStore
16
+ from haystack.nodes import (
17
+ DocxToTextConverter,
18
+ EmbeddingRetriever,
19
+ FARMReader,
20
+ FileTypeClassifier,
21
+ PDFToTextConverter,
22
+ PreProcessor,
23
+ TextConverter,
24
+ )
25
+ from haystack.pipelines import ExtractiveQAPipeline, Pipeline
26
+ from markdown import markdown
27
+ from sentence_transformers import SentenceTransformer
28
+
29
+ index_name = "qa_demo"
30
+
31
+
32
+ # connect to pinecone environment
33
+ pinecone.init(
34
+ api_key=st.secrets["pinecone_apikey"],
35
+ # environment="us-west1-gcp"
36
+ )
37
+ index_name = "qa-demo"
38
+
39
+ preprocessor = PreProcessor(
40
+ clean_empty_lines=True,
41
+ clean_whitespace=True,
42
+ clean_header_footer=False,
43
+ split_by="word",
44
+ split_length=100,
45
+ split_respect_sentence_boundary=True
46
+ )
47
+ file_type_classifier = FileTypeClassifier()
48
+ text_converter = TextConverter()
49
+ pdf_converter = PDFToTextConverter()
50
+ docx_converter = DocxToTextConverter()
51
+
52
+ # check whether the Pinecone index named by index_name already exists
53
+ if index_name not in pinecone.list_indexes():
54
+ # create the index if it does not exist
55
+ pinecone.create_index(
56
+ index_name,
57
+ dimension=768,
58
+ metric="cosine"
59
+ )
60
+
61
+ # connect to the index named by index_name (created above if it was missing)
62
+ index = pinecone.Index(index_name)
63
+
64
+ FILE_UPLOAD_PATH= "./data/uploads/"
65
+ os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)
66
+ # @st.cache
67
def create_doc_store():
    """Build the Pinecone-backed Haystack document store used by the app.

    The API key is read from the Streamlit secret ``pinecone_apikey`` and the
    index name from the module-level ``index_name``.

    Returns:
        A ``PineconeDocumentStore`` configured for cosine similarity and
        768-dimensional embeddings.
    """
    store_settings = {
        "api_key": st.secrets["pinecone_apikey"],
        "index": index_name,
        "similarity": "cosine",
        "embedding_dim": 768,
    }
    return PineconeDocumentStore(**store_settings)
75
+
76
+ # @st.cache
77
+ # def create_pipe(document_store):
78
+ # retriever = EmbeddingRetriever(
79
+ # document_store=document_store,
80
+ # embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
81
+ # model_format="sentence_transformers",
82
+ # )
83
+ # reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
84
+ # pipe = ExtractiveQAPipeline(reader, retriever)
85
+ # return pipe
86
+
87
def query(pipe, question, top_k_reader, top_k_retriever):
    """Run a question through the QA pipeline and return its raw result.

    Args:
        pipe: Pipeline object exposing ``run(query=..., params=...)``.
        question: The question text passed as the pipeline query.
        top_k_reader: ``top_k`` value forwarded to the ``Reader`` node.
        top_k_retriever: ``top_k`` value forwarded to the ``Retriever`` node.

    Returns:
        Whatever ``pipe.run`` returns, unmodified.
    """
    return pipe.run(
        query=question,
        params={
            "Retriever": {"top_k": top_k_retriever},
            "Reader": {"top_k": top_k_reader},
        },
    )
100
+
101
+ document_store = create_doc_store()
102
+ # pipe = create_pipe(document_store)
103
+ retriever_model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
104
+ retriever = EmbeddingRetriever(
105
+ document_store=document_store,
106
+ embedding_model=retriever_model,
107
+ model_format="sentence_transformers",
108
+ )
109
+ # load the retriever model from huggingface model hub
110
+ sentence_encoder = SentenceTransformer(retriever_model)
111
+
112
+ reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
113
+ pipe = ExtractiveQAPipeline(reader, retriever)
114
+
115
+
116
+ indexing_pipeline_with_classification = Pipeline()
117
+ indexing_pipeline_with_classification.add_node(
118
+ component=file_type_classifier, name="FileTypeClassifier", inputs=["File"]
119
+ )
120
+ indexing_pipeline_with_classification.add_node(
121
+ component=text_converter, name="TextConverter", inputs=["FileTypeClassifier.output_1"]
122
+ )
123
+ indexing_pipeline_with_classification.add_node(
124
+ component=pdf_converter, name="PdfConverter", inputs=["FileTypeClassifier.output_2"]
125
+ )
126
+ indexing_pipeline_with_classification.add_node(
127
+ component=docx_converter, name="DocxConverter", inputs=["FileTypeClassifier.output_4"]
128
+ )
129
+ indexing_pipeline_with_classification.add_node(
130
+ component=preprocessor,
131
+ name="Preprocessor",
132
+ inputs=["TextConverter", "PdfConverter", "DocxConverter"],
133
+ )
134
+
135
def set_state_if_absent(key, value):
    """Store ``value`` under ``key`` in ``st.session_state`` unless the key is already present."""
    if key in st.session_state:
        return
    st.session_state[key] = value
138
+
139
+ # Adjust to a question that you would like users to see in the search bar when they load the UI:
140
+ DEFAULT_QUESTION_AT_STARTUP = os.getenv("DEFAULT_QUESTION_AT_STARTUP", "My blog post discusses remote work. Give me statistics.")
141
+ DEFAULT_ANSWER_AT_STARTUP = os.getenv("DEFAULT_ANSWER_AT_STARTUP", "7% more remote workers have been at their current organization for 5 years or fewer")
142
+
143
+ # Sliders
144
+ DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
145
+ DEFAULT_NUMBER_OF_ANSWERS = int(os.getenv("DEFAULT_NUMBER_OF_ANSWERS", "3"))
146
+
147
+
148
+ st.set_page_config(page_title="Haystack Demo", page_icon="https://haystack.deepset.ai/img/HaystackIcon.png")
149
+
150
+ # Persistent state
151
+ set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP)
152
+ set_state_if_absent("answer", DEFAULT_ANSWER_AT_STARTUP)
153
+ set_state_if_absent("results", None)
154
+
155
+
156
+ # Small callback to reset the interface in case the text of the question changes
157
def reset_results(*args):
    """Set the ``answer``, ``results`` and ``raw_json`` session-state entries to ``None``.

    Any positional arguments are accepted and ignored.
    """
    for state_attr in ("answer", "results", "raw_json"):
        setattr(st.session_state, state_attr, None)
161
+
162
+ # Title
163
+ st.write("# Haystack Search Demo")
164
+ st.markdown(
165
+ """
166
+ This demo takes its data from two sample data csv with statistics on various topics. \n
167
+ Ask any question on this topic and see if Haystack can find the correct answer to your query! \n
168
+ *Note: do not use keywords, but full-fledged questions.* The demo is not optimized to deal with keyword queries and might misunderstand you.
169
+ """,
170
+ unsafe_allow_html=True,
171
+ )
172
+
173
+ # Sidebar
174
+ st.sidebar.header("Options")
175
+ st.sidebar.write("## File Upload:")
176
+ data_files = st.sidebar.file_uploader(
177
+ "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
178
+ )
179
+ ALL_FILES = []
180
+ META_DATA = []
181
+ for data_file in data_files:
182
+ # Upload file
183
+ if data_file:
184
+ file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{data_file.name}"
185
+ with open(file_path, "wb") as f:
186
+ f.write(data_file.getbuffer())
187
+ ALL_FILES.append(file_path)
188
+ st.sidebar.write(str(data_file.name) + " &nbsp;&nbsp; ✅ ")
189
+ META_DATA.append({"filename":data_file.name})
190
+
191
+
192
# Index the files uploaded in this run: convert/split them into documents,
# embed the document text in batches and upsert the vectors into Pinecone.
if ALL_FILES:
    docs = indexing_pipeline_with_classification.run(file_paths=ALL_FILES, meta=META_DATA)["documents"]
    # we will use batches of 64
    batch_size = 64
    with st.spinner(
        "🧠 &nbsp;&nbsp; Performing indexing of uploaded documents... \n "
    ):
        for i in range(0, len(docs), batch_size):
            # slicing past the end is safe, so no explicit end-of-batch clamp is needed
            batch_docs = docs[i:i + batch_size]
            # generate embeddings for the text of every document in the batch
            emb = sentence_encoder.encode([doc.content for doc in batch_docs]).tolist()
            # one (id, vector, metadata) record per document
            # NOTE(review): only ids, vectors and doc.meta are upserted; the
            # document text itself is not sent. Confirm the retriever can
            # still return content for the reader from these records.
            to_upsert = [
                (doc.id, vector, doc.meta)
                for doc, vector in zip(batch_docs, emb)
            ]
            # upsert/insert these records to pinecone
            index.upsert(vectors=to_upsert)
217
+
218
+ top_k_reader = st.sidebar.slider(
219
+ "Max. number of answers",
220
+ min_value=1,
221
+ max_value=10,
222
+ value=DEFAULT_NUMBER_OF_ANSWERS,
223
+ step=1,
224
+ on_change=reset_results,
225
+ )
226
+ top_k_retriever = st.sidebar.slider(
227
+ "Max. number of documents from retriever",
228
+ min_value=1,
229
+ max_value=10,
230
+ value=DEFAULT_DOCS_FROM_RETRIEVER,
231
+ step=1,
232
+ on_change=reset_results,
233
+ )
234
+ # data_files = st.file_uploader(
235
+ # "upload", type=["csv"], accept_multiple_files=True, label_visibility="hidden"
236
+ # )
237
+ # for data_file in data_files:
238
+ # # Upload file
239
+ # if data_file:
240
+ # raw_json = upload_doc(data_file)
241
+
242
+ question = st.text_input(
243
+ value=st.session_state.question,
244
+ max_chars=100,
245
+ on_change=reset_results,
246
+ label="question",
247
+ label_visibility="hidden",
248
+ )
249
+ col1, col2 = st.columns(2)
250
+ col1.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
251
+ col2.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
252
+
253
+ # Run button
254
+ run_pressed = col1.button("Run")
255
+ if run_pressed:
256
+
257
+ run_query = (
258
+ run_pressed or question != st.session_state.question
259
+ )
260
+ # Get results for query
261
+ if run_query and question:
262
+ reset_results()
263
+ st.session_state.question = question
264
+
265
+ with st.spinner(
266
+ "🧠 &nbsp;&nbsp; Performing neural search on documents... \n "
267
+ ):
268
+ try:
269
+ st.session_state.results = query(
270
+ pipe, question, top_k_reader=top_k_reader, top_k_retriever=top_k_retriever
271
+ )
272
+ except JSONDecodeError as je:
273
+ st.error("👓 &nbsp;&nbsp; An error occurred reading the results. Is the document store working?")
274
+ except Exception as e:
275
+ logging.exception(e)
276
+ if "The server is busy processing requests" in str(e) or "503" in str(e):
277
+ st.error("🧑‍🌾 &nbsp;&nbsp; All our workers are busy! Try again later.")
278
+ else:
279
+ st.error(f"🐞 &nbsp;&nbsp; An error occurred during the request. {str(e)}")
280
+
281
+
282
# Render every answer with its surrounding context, highlighting the answer span.
if st.session_state.results:

    st.write("## Results:")

    for result in st.session_state.results['answers']:
        # Treat missing answer/context/meta as empty instead of crashing on None.
        answer = result.answer or ""
        context = result.context or ""
        meta = result.meta or {}

        start_idx = context.find(answer)
        if start_idx < 0:
            # The answer text does not occur verbatim in the context; using -1
            # as a slice index would garble the output, so show the
            # highlighted answer followed by the whole context instead.
            before, after = "", context
        else:
            before = context[:start_idx]
            after = context[start_idx + len(answer):]
        highlighted = f'{before} {str(annotation(answer, "ANSWER", "#8ef"))} {after}'

        # Prefer a linked title when both keys exist; otherwise fall back to
        # the filename recorded for the uploaded file.
        if 'Title' in meta and 'link' in meta:
            header = f"**Source:** [{meta['Title']}]({meta['link']})"
        else:
            header = f"From file: {meta.get('filename', '')}"

        # Hack due to this bug: https://github.com/streamlit/streamlit/issues/3190
        st.write(
            markdown(f'{header} \n {highlighted} \n '),
            unsafe_allow_html=True,
        )
303
+
304
+
305
+
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ poppler-utils
2
+ xpdf
pinecorn.haystack-pipeline.yml.yml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To allow your IDE to autocomplete and validate your YAML pipelines, name them as <name of your choice>.haystack-pipeline.yml
2
+
3
+ version: ignore
4
+
5
+ components: # define all the building-blocks for Pipeline
6
- name: DocumentStore
  type: ElasticsearchDocumentStore
  params:
    index: qa_demo  # key was misspelled as "index=", which is not a document-store parameter
    similarity: cosine
    embedding_dim: 768
12
+ - name: Retriever
13
+ type: BM25Retriever
14
+ params:
15
+ document_store: DocumentStore # params can reference other components defined in the YAML
16
+ top_k: 5
17
+ - name: Reader # custom-name for the component; helpful for visualization & debugging
18
+ type: FARMReader # Haystack Class name for the component
19
+ params:
20
+ model_name_or_path: deepset/roberta-base-squad2
21
+ context_window_size: 500
22
+ return_no_answer: true
23
+ - name: TextFileConverter
24
+ type: TextConverter
25
+ - name: PDFFileConverter
26
+ type: PDFToTextConverter
27
+ - name: Preprocessor
28
+ type: PreProcessor
29
+ params:
30
+ split_by: word
31
+ split_length: 1000
32
+ - name: FileTypeClassifier
33
+ type: FileTypeClassifier
34
+
35
+ pipelines:
36
+ - name: query # a sample extractive-qa Pipeline
37
+ nodes:
38
+ - name: Retriever
39
+ inputs: [Query]
40
+ - name: Reader
41
+ inputs: [Retriever]
42
+ - name: indexing
43
+ nodes:
44
+ - name: FileTypeClassifier
45
+ inputs: [File]
46
+ - name: TextFileConverter
47
+ inputs: [FileTypeClassifier.output_1]
48
+ - name: PDFFileConverter
49
+ inputs: [FileTypeClassifier.output_2]
50
+ - name: Preprocessor
51
+ inputs: [PDFFileConverter, TextFileConverter]
52
+ - name: Retriever
53
+ inputs: [Preprocessor]
54
+ - name: DocumentStore
55
+ inputs: [Retriever]
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ protobuf==3.19
2
+ streamlit==1.13
3
+ st-annotated-text
4
+ farm-haystack[pinecone]
5
+ farm-haystack[ocr]
6
+ pinecone-client
7
+ datasets
8
+ tensorboard
search.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import pinecone
4
+ index_name = "abstractive-question-answering"
5
+
6
+ # check if the abstractive-question-answering index exists
7
+ if index_name not in pinecone.list_indexes():
8
+ # create the index if it does not exist
9
+ pinecone.create_index(
10
+ index_name,
11
+ dimension=768,
12
+ metric="cosine"
13
+ )
14
+
15
+ # connect to abstractive-question-answering index we created
16
+ index = pinecone.Index(index_name)
17
+
18
+ # we will use batches of 64
19
+ batch_size = 64
20
+
21
+ for i in tqdm(range(0, len(df), batch_size)):
22
+ # find end of batch
23
+ i_end = min(i+batch_size, len(df))
24
+ # extract batch
25
+ batch = df.iloc[i:i_end]
26
+ # generate embeddings for batch
27
+ emb = retriever.encode(batch["passage_text"].tolist()).tolist()
28
+ # get metadata
29
+ meta = batch.to_dict(orient="records")
30
+ # create unique IDs
31
+ ids = [f"{idx}" for idx in range(i, i_end)]
32
+ # add all to upsert list
33
+ to_upsert = list(zip(ids, emb, meta))
34
+ # upsert/insert these records to pinecone
35
+ _ = index.upsert(vectors=to_upsert)
36
+
37
+ # check that we have all vectors in index
38
+ index.describe_index_stats()
39
+
40
+ # from transformers import BartTokenizer, BartForConditionalGeneration
41
+
42
+ # # load bart tokenizer and model from huggingface
43
+ # tokenizer = BartTokenizer.from_pretrained('vblagoje/bart_lfqa')
44
+ # generator = BartForConditionalGeneration.from_pretrained('vblagoje/bart_lfqa')
45
+
46
+ # def query_pinecone(query, top_k):
47
+ # # generate embeddings for the query
48
+ # xq = retriever.encode([query]).tolist()
49
+ # # search pinecone index for context passage with the answer
50
+ # xc = index.query(xq, top_k=top_k, include_metadata=True)
51
+ # return xc
52
+
53
+ # def format_query(query, context):
54
+ # # extract passage_text from Pinecone search result and add the tag
55
+ # context = [f" {m['metadata']['passage_text']}" for m in context]
56
+ # # concatinate all context passages
57
+ # context = " ".join(context)
58
+ # # contcatinate the query and context passages
59
+ # query = f"question: {query} context: {context}"
60
+ # return query