refactoring grobid client generic
Files changed:
- document_qa/document_qa_engine.py (+14 -10)
- document_qa/grobid_processors.py (+10 -1)
- client.py → document_qa/ner_client_generic.py (+237 -1)
- grobid_client_generic.py (+0 -264)
- streamlit_app.py (+1 -1)
document_qa/document_qa_engine.py
CHANGED
@@ -85,6 +85,9 @@ class TextMerger:
 
         return new_passages_struct
 
+class DataStorage:
+
+
 
 class DocumentQAEngine:
     llm = None
@@ -123,16 +126,7 @@ class DocumentQAEngine:
         self.load_embeddings(self.embeddings_root_path)
 
         if grobid_url:
-            self.
-            grobid_client = GrobidClient(
-                grobid_server=self.grobid_url,
-                batch_size=1000,
-                coordinates=["p", "title", "persName"],
-                sleep_time=5,
-                timeout=60,
-                check_server=True
-            )
-            self.grobid_processor = GrobidProcessor(grobid_client)
+            self.grobid_processor = GrobidProcessor(grobid_url)
 
     def load_embeddings(self, embeddings_root_path: Union[str, Path]) -> None:
         """
@@ -204,6 +198,16 @@ class DocumentQAEngine:
         context_as_text = [doc.page_content for doc in documents]
         return context_as_text
 
+    def query_storage_and_embeddings(self, query: str, doc_id, context_size=4):
+        db = self.embeddings_dict[doc_id]
+        retriever = db.as_retriever(search_kwargs={"k": context_size})
+        relevant_documents = retriever.get_relevant_documents(query, include=["embeddings"])
+
+        context_as_text = [doc.page_content for doc in relevant_documents]
+        return context_as_text
+
+        # chroma_collection.get(include=['embeddings'])['embeddings']
+
     def _parse_json(self, response, output_parser):
         system_message = "You are an useful assistant expert in materials science, physics, and chemistry " \
                          "that can process text and transform it to JSON."
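For orientation, a minimal sketch of how the new retrieval path might be called once an engine instance exists. Only query_storage_and_embeddings and embeddings_dict appear in this diff; the engine construction and the surrounding names below are placeholders, not part of the commit:

    # engine = DocumentQAEngine(...)  # construction not shown in this commit
    passages = engine.query_storage_and_embeddings(
        "What is the transition temperature?",  # free-text question
        doc_id,           # key into engine.embeddings_dict for an already-embedded document
        context_size=4    # number of passages retrieved from the per-document vector store
    )
    context = "\n".join(passages)  # plain-text context returned directly, with no LLM call
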
document_qa/grobid_processors.py
CHANGED
@@ -6,6 +6,7 @@ from pathlib import Path
 import dateparser
 import grobid_tei_xml
 from bs4 import BeautifulSoup
+from grobid_client.grobid_client import GrobidClient
 from tqdm import tqdm
 
 
@@ -127,8 +128,16 @@ class BaseProcessor(object):
 
 
 class GrobidProcessor(BaseProcessor):
-    def __init__(self,
+    def __init__(self, grobid_url, ping_server=True):
         # super().__init__()
+        grobid_client = GrobidClient(
+            grobid_server=grobid_url,
+            batch_size=5,
+            coordinates=["p", "title", "persName"],
+            sleep_time=5,
+            timeout=60,
+            check_server=ping_server
+        )
         self.grobid_client = grobid_client
 
     def process_structure(self, input_path, coordinates=False):
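The net effect is that callers now hand GrobidProcessor a server URL instead of a pre-built client. A short sketch under the assumption of a GROBID server on localhost:8070; the URL and PDF path are placeholders:

    from document_qa.grobid_processors import GrobidProcessor

    # The processor builds its own GrobidClient internally;
    # ping_server=False maps to check_server=False and skips the availability check.
    processor = GrobidProcessor("http://localhost:8070", ping_server=False)

    # process_structure() itself is untouched by this commit.
    structure = processor.process_structure("paper.pdf")
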
client.py → document_qa/ner_client_generic.py
RENAMED
@@ -1,3 +1,15 @@
+import os
+import time
+
+import yaml
+
+'''
+This client is a generic client for any Grobid application and sub-modules.
+At the moment, it supports only single document processing.
+
+Source: https://github.com/kermitt2/grobid-client-python
+'''
+
 """ Generic API Client """
 from copy import deepcopy
 import json
@@ -121,7 +133,7 @@ class ApiClient(object):
         params = deepcopy(params) or {}
         data = data or {}
         files = files or {}
-        #if self.username is not None and self.api_key is not None:
+        # if self.username is not None and self.api_key is not None:
         #     params.update(self.get_credentials())
         r = requests.request(
             method,
@@ -223,3 +235,227 @@
             params={'format': 'json'},
             **kwargs
         )
+
+
+class NERClientGeneric(ApiClient):
+
+    def __init__(self, config_path=None, ping=False):
+        self.config = None
+        if config_path is not None:
+            self.config = self._load_yaml_config_from_file(path=config_path)
+            super().__init__(self.config['grobid']['server'])
+
+        if ping:
+            result = self.ping_service()
+            if not result:
+                raise Exception("Grobid is down.")
+
+        os.environ['NO_PROXY'] = "nims.go.jp"
+
+    @staticmethod
+    def _load_json_config_from_file(path='./config.json'):
+        """
+        Load the json configuration
+        """
+        config = {}
+        with open(path, 'r') as fp:
+            config = json.load(fp)
+
+        return config
+
+    @staticmethod
+    def _load_yaml_config_from_file(path='./config.yaml'):
+        """
+        Load the YAML configuration
+        """
+        config = {}
+        try:
+            with open(path, 'r') as the_file:
+                raw_configuration = the_file.read()
+
+            config = yaml.safe_load(raw_configuration)
+        except Exception as e:
+            print("Configuration could not be loaded: ", str(e))
+            exit(1)
+
+        return config
+
+    def set_config(self, config, ping=False):
+        self.config = config
+        if ping:
+            try:
+                result = self.ping_service()
+                if not result:
+                    raise Exception("Grobid is down.")
+            except Exception as e:
+                raise Exception("Grobid is down or other problems were encountered. ", e)
+
+    def ping_service(self):
+        # test if the server is up and running...
+        ping_url = self.get_url("ping")
+
+        r = requests.get(ping_url)
+        status = r.status_code
+
+        if status != 200:
+            print('GROBID server does not appear up and running ' + str(status))
+            return False
+        else:
+            print("GROBID server is up and running")
+            return True
+
+    def get_url(self, action):
+        grobid_config = self.config['grobid']
+        base_url = grobid_config['server']
+        action_url = base_url + grobid_config['url_mapping'][action]
+
+        return action_url
+
+    def process_texts(self, input, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
+
+        files = {
+            'texts': input
+        }
+
+        the_url = self.get_url(method_name)
+        params, the_url = self.get_params_from_url(the_url)
+
+        res, status = self.post(
+            url=the_url,
+            files=files,
+            data=params,
+            headers=headers
+        )
+
+        if status == 503:
+            time.sleep(self.config['sleep_time'])
+            return self.process_texts(input, method_name, params, headers)
+        elif status != 200:
+            print('Processing failed with error ' + str(status))
+            return status, None
+        else:
+            return status, json.loads(res.text)
+
+    def process_text(self, input, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
+
+        files = {
+            'text': input
+        }
+
+        the_url = self.get_url(method_name)
+        params, the_url = self.get_params_from_url(the_url)
+
+        res, status = self.post(
+            url=the_url,
+            files=files,
+            data=params,
+            headers=headers
+        )
+
+        if status == 503:
+            time.sleep(self.config['sleep_time'])
+            return self.process_text(input, method_name, params, headers)
+        elif status != 200:
+            print('Processing failed with error ' + str(status))
+            return status, None
+        else:
+            return status, json.loads(res.text)
+
+    def process_pdf(self,
+                    form_data: dict,
+                    method_name='superconductors',
+                    params={},
+                    headers={"Accept": "application/json"}
+                    ):
+
+        the_url = self.get_url(method_name)
+        params, the_url = self.get_params_from_url(the_url)
+
+        res, status = self.post(
+            url=the_url,
+            files=form_data,
+            data=params,
+            headers=headers
+        )
+
+        if status == 503:
+            time.sleep(self.config['sleep_time'])
+            return self.process_text(input, method_name, params, headers)
+        elif status != 200:
+            print('Processing failed with error ' + str(status))
+        else:
+            return res.text
+
+    def process_pdfs(self, pdf_files, params={}):
+        pass
+
+    def process_pdf(
+            self,
+            pdf_file,
+            method_name,
+            params={},
+            headers={"Accept": "application/json"},
+            verbose=False,
+            retry=None
+    ):
+
+        files = {
+            'input': (
+                pdf_file,
+                open(pdf_file, 'rb'),
+                'application/pdf',
+                {'Expires': '0'}
+            )
+        }
+
+        the_url = self.get_url(method_name)
+
+        params, the_url = self.get_params_from_url(the_url)
+
+        res, status = self.post(
+            url=the_url,
+            files=files,
+            data=params,
+            headers=headers
+        )
+
+        if status == 503 or status == 429:
+            if retry is None:
+                retry = self.config['max_retry'] - 1
+            else:
+                if retry - 1 == 0:
+                    if verbose:
+                        print("re-try exhausted. Aborting request")
+                    return None, status
+                else:
+                    retry -= 1
+
+            sleep_time = self.config['sleep_time']
+            if verbose:
+                print("Server is saturated, waiting", sleep_time, "seconds and trying again. ")
+            time.sleep(sleep_time)
+            return self.process_pdf(pdf_file, method_name, params, headers, verbose=verbose, retry=retry)
+        elif status != 200:
+            desc = None
+            if res.content:
+                c = json.loads(res.text)
+                desc = c['description'] if 'description' in c else None
+            return desc, status
+        elif status == 204:
+            # print('No content returned. Moving on. ')
+            return None, status
+        else:
+            return res.text, status
+
+    def get_params_from_url(self, the_url):
+        """
+        This method is used to pass to the URL predefined parameters, which are added in the URL format
+        """
+        params = {}
+        if "?" in the_url:
+            split = the_url.split("?")
+            the_url = split[0]
+            params = split[1]
+
+            params = {param.split("=")[0]: param.split("=")[1] for param in params.split("&")}
+        return params, the_url
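NERClientGeneric is essentially the former GrobidClientGeneric (deleted below) carried over with renamed methods: ping_grobid becomes ping_service, get_grobid_url becomes get_url, process becomes process_pdf, process_pdf_batch becomes process_pdfs, and process_json is dropped. A usage sketch follows; the YAML layout is inferred from the keys the class reads (grobid.server, grobid.url_mapping, sleep_time, max_retry), and the concrete port and endpoint paths are assumptions, not part of this commit:

    # config.yaml (hypothetical layout, inferred from the keys used above):
    #
    # grobid:
    #   server: "http://localhost:8072"
    #   url_mapping:
    #     ping: "/service/ping"
    #     superconductors: "/service/process/text"
    # sleep_time: 5
    # max_retry: 5

    from document_qa.ner_client_generic import NERClientGeneric

    client = NERClientGeneric(config_path="config.yaml", ping=False)
    status, entities = client.process_text(
        "MgB2 becomes superconducting below 39 K.",
        method_name="superconductors"
    )
    if status == 200:
        print(entities)
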
grobid_client_generic.py
DELETED
@@ -1,264 +0,0 @@
-import json
-import os
-import time
-
-import requests
-import yaml
-
-from client import ApiClient
-
-'''
-This client is a generic client for any Grobid application and sub-modules.
-At the moment, it supports only single document processing.
-
-Source: https://github.com/kermitt2/grobid-client-python
-'''
-
-
-class GrobidClientGeneric(ApiClient):
-
-    def __init__(self, config_path=None, ping=False):
-        self.config = None
-        if config_path is not None:
-            self.config = self.load_yaml_config_from_file(path=config_path)
-            super().__init__(self.config['grobid']['server'])
-
-        if ping:
-            result = self.ping_grobid()
-            if not result:
-                raise Exception("Grobid is down.")
-
-        os.environ['NO_PROXY'] = "nims.go.jp"
-
-    @staticmethod
-    def load_json_config_from_file(self, path='./config.json', ping=False):
-        """
-        Load the json configuration
-        """
-        config = {}
-        with open(path, 'r') as fp:
-            config = json.load(fp)
-
-        if ping:
-            result = self.ping_grobid()
-            if not result:
-                raise Exception("Grobid is down.")
-
-        return config
-
-    def load_yaml_config_from_file(self, path='./config.yaml'):
-        """
-        Load the YAML configuration
-        """
-        config = {}
-        try:
-            with open(path, 'r') as the_file:
-                raw_configuration = the_file.read()
-
-            config = yaml.safe_load(raw_configuration)
-        except Exception as e:
-            print("Configuration could not be loaded: ", str(e))
-            exit(1)
-
-        return config
-
-    def set_config(self, config, ping=False):
-        self.config = config
-        if ping:
-            try:
-                result = self.ping_grobid()
-                if not result:
-                    raise Exception("Grobid is down.")
-            except Exception as e:
-                raise Exception("Grobid is down or other problems were encountered. ", e)
-
-    def ping_grobid(self):
-        # test if the server is up and running...
-        ping_url = self.get_grobid_url("ping")
-
-        r = requests.get(ping_url)
-        status = r.status_code
-
-        if status != 200:
-            print('GROBID server does not appear up and running ' + str(status))
-            return False
-        else:
-            print("GROBID server is up and running")
-            return True
-
-    def get_grobid_url(self, action):
-        grobid_config = self.config['grobid']
-        base_url = grobid_config['server']
-        action_url = base_url + grobid_config['url_mapping'][action]
-
-        return action_url
-
-    def process_texts(self, input, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
-
-        files = {
-            'texts': input
-        }
-
-        the_url = self.get_grobid_url(method_name)
-        params, the_url = self.get_params_from_url(the_url)
-
-        res, status = self.post(
-            url=the_url,
-            files=files,
-            data=params,
-            headers=headers
-        )
-
-        if status == 503:
-            time.sleep(self.config['sleep_time'])
-            return self.process_texts(input, method_name, params, headers)
-        elif status != 200:
-            print('Processing failed with error ' + str(status))
-            return status, None
-        else:
-            return status, json.loads(res.text)
-
-    def process_text(self, input, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
-
-        files = {
-            'text': input
-        }
-
-        the_url = self.get_grobid_url(method_name)
-        params, the_url = self.get_params_from_url(the_url)
-
-        res, status = self.post(
-            url=the_url,
-            files=files,
-            data=params,
-            headers=headers
-        )
-
-        if status == 503:
-            time.sleep(self.config['sleep_time'])
-            return self.process_text(input, method_name, params, headers)
-        elif status != 200:
-            print('Processing failed with error ' + str(status))
-            return status, None
-        else:
-            return status, json.loads(res.text)
-
-    def process(self, form_data: dict, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
-
-        the_url = self.get_grobid_url(method_name)
-        params, the_url = self.get_params_from_url(the_url)
-
-        res, status = self.post(
-            url=the_url,
-            files=form_data,
-            data=params,
-            headers=headers
-        )
-
-        if status == 503:
-            time.sleep(self.config['sleep_time'])
-            return self.process_text(input, method_name, params, headers)
-        elif status != 200:
-            print('Processing failed with error ' + str(status))
-        else:
-            return res.text
-
-    def process_pdf_batch(self, pdf_files, params={}):
-        pass
-
-    def process_pdf(self, pdf_file, method_name, params={}, headers={"Accept": "application/json"}, verbose=False,
-                    retry=None):
-
-        files = {
-            'input': (
-                pdf_file,
-                open(pdf_file, 'rb'),
-                'application/pdf',
-                {'Expires': '0'}
-            )
-        }
-
-        the_url = self.get_grobid_url(method_name)
-
-        params, the_url = self.get_params_from_url(the_url)
-
-        res, status = self.post(
-            url=the_url,
-            files=files,
-            data=params,
-            headers=headers
-        )
-
-        if status == 503 or status == 429:
-            if retry is None:
-                retry = self.config['max_retry'] - 1
-            else:
-                if retry - 1 == 0:
-                    if verbose:
-                        print("re-try exhausted. Aborting request")
-                    return None, status
-                else:
-                    retry -= 1
-
-            sleep_time = self.config['sleep_time']
-            if verbose:
-                print("Server is saturated, waiting", sleep_time, "seconds and trying again. ")
-            time.sleep(sleep_time)
-            return self.process_pdf(pdf_file, method_name, params, headers, verbose=verbose, retry=retry)
-        elif status != 200:
-            desc = None
-            if res.content:
-                c = json.loads(res.text)
-                desc = c['description'] if 'description' in c else None
-            return desc, status
-        elif status == 204:
-            # print('No content returned. Moving on. ')
-            return None, status
-        else:
-            return res.text, status
-
-    def get_params_from_url(self, the_url):
-        params = {}
-        if "?" in the_url:
-            split = the_url.split("?")
-            the_url = split[0]
-            params = split[1]
-
-            params = {param.split("=")[0]: param.split("=")[1] for param in params.split("&")}
-        return params, the_url
-
-    def process_json(self, text, method_name="processJson", params={}, headers={"Accept": "application/json"},
-                     verbose=False):
-        files = {
-            'input': (
-                None,
-                text,
-                'application/json',
-                {'Expires': '0'}
-            )
-        }
-
-        the_url = self.get_grobid_url(method_name)
-
-        params, the_url = self.get_params_from_url(the_url)
-
-        res, status = self.post(
-            url=the_url,
-            files=files,
-            data=params,
-            headers=headers
-        )
-
-        if status == 503:
-            time.sleep(self.config['sleep_time'])
-            return self.process_json(text, method_name, params, headers), status
-        elif status != 200:
-            if verbose:
-                print('Processing failed with error ', status)
-            return None, status
-        elif status == 204:
-            if verbose:
-                print('No content returned. Moving on. ')
-            return None, status
-        else:
-            return res.text, status
streamlit_app.py
CHANGED
@@ -441,7 +441,7 @@ with right_column:
         text_response = None
         if mode == "Embeddings":
             with st.spinner("Generating LLM response..."):
-                text_response = st.session_state['rqa'][model].
+                text_response = st.session_state['rqa'][model].query_storage_and_embeddings(question, st.session_state.doc_id,
                                                                                             context_size=context_size)
        elif mode == "LLM":
            with st.spinner("Generating response..."):
|