lfoppiano committed on
Commit
eedfb73
1 Parent(s): c08e73a

refactoring grobid client generic

Browse files
document_qa/document_qa_engine.py CHANGED
@@ -85,6 +85,9 @@ class TextMerger:
85
 
86
  return new_passages_struct
87
 
 
 
 
88
 
89
  class DocumentQAEngine:
90
  llm = None
@@ -123,16 +126,7 @@ class DocumentQAEngine:
123
  self.load_embeddings(self.embeddings_root_path)
124
 
125
  if grobid_url:
126
- self.grobid_url = grobid_url
127
- grobid_client = GrobidClient(
128
- grobid_server=self.grobid_url,
129
- batch_size=1000,
130
- coordinates=["p", "title", "persName"],
131
- sleep_time=5,
132
- timeout=60,
133
- check_server=True
134
- )
135
- self.grobid_processor = GrobidProcessor(grobid_client)
136
 
137
  def load_embeddings(self, embeddings_root_path: Union[str, Path]) -> None:
138
  """
@@ -204,6 +198,16 @@ class DocumentQAEngine:
204
  context_as_text = [doc.page_content for doc in documents]
205
  return context_as_text
206
 
 
 
 
 
 
 
 
 
 
 
207
  def _parse_json(self, response, output_parser):
208
  system_message = "You are an useful assistant expert in materials science, physics, and chemistry " \
209
  "that can process text and transform it to JSON."
 
85
 
86
  return new_passages_struct
87
 
88
class DataStorage:
    """Placeholder class: no attributes or methods are defined in this revision.

    The docstring doubles as the class body so that the module stays
    importable (a class header without a body is a syntax error).
    """

91
 
92
  class DocumentQAEngine:
93
  llm = None
 
126
  self.load_embeddings(self.embeddings_root_path)
127
 
128
  if grobid_url:
129
+ self.grobid_processor = GrobidProcessor(grobid_url)
 
 
 
 
 
 
 
 
 
130
 
131
  def load_embeddings(self, embeddings_root_path: Union[str, Path]) -> None:
132
  """
 
198
  context_as_text = [doc.page_content for doc in documents]
199
  return context_as_text
200
 
201
def query_storage_and_embeddings(self, query: str, doc_id, context_size=4):
    """Return the text of the passages retrieved for ``query`` from one document's store.

    The store registered under ``doc_id`` in ``self.embeddings_dict`` is turned
    into a retriever limited to ``context_size`` results (``k``), which is then
    asked for the documents relevant to ``query``; ``include=["embeddings"]``
    is forwarded to that call unchanged.

    :param query: the question or search text
    :param doc_id: key of the document store inside ``self.embeddings_dict``
    :param context_size: number of passages requested from the retriever
    :return: list with the ``page_content`` of every retrieved document
    :raises KeyError: if ``doc_id`` is not present in ``self.embeddings_dict``
    """
    store = self.embeddings_dict[doc_id]
    hits = store.as_retriever(
        search_kwargs={"k": context_size}
    ).get_relevant_documents(query, include=["embeddings"])

    return [hit.page_content for hit in hits]
208
+
209
+ # chroma_collection.get(include=['embeddings'])['embeddings']
210
+
211
  def _parse_json(self, response, output_parser):
212
  system_message = "You are an useful assistant expert in materials science, physics, and chemistry " \
213
  "that can process text and transform it to JSON."
document_qa/grobid_processors.py CHANGED
@@ -6,6 +6,7 @@ from pathlib import Path
6
  import dateparser
7
  import grobid_tei_xml
8
  from bs4 import BeautifulSoup
 
9
  from tqdm import tqdm
10
 
11
 
@@ -127,8 +128,16 @@ class BaseProcessor(object):
127
 
128
 
129
  class GrobidProcessor(BaseProcessor):
130
- def __init__(self, grobid_client):
131
  # super().__init__()
 
 
 
 
 
 
 
 
132
  self.grobid_client = grobid_client
133
 
134
  def process_structure(self, input_path, coordinates=False):
 
6
  import dateparser
7
  import grobid_tei_xml
8
  from bs4 import BeautifulSoup
9
+ from grobid_client.grobid_client import GrobidClient
10
  from tqdm import tqdm
11
 
12
 
 
128
 
129
 
130
  class GrobidProcessor(BaseProcessor):
131
def __init__(self, grobid_url, ping_server=True):
    """Build the processor around a ``GrobidClient`` pointed at ``grobid_url``.

    :param grobid_url: address handed to the client as ``grobid_server``
    :param ping_server: forwarded to the client as ``check_server``
    """
    # super().__init__()  -- the base initialiser is deliberately left uncalled here
    client_options = dict(
        grobid_server=grobid_url,
        batch_size=5,
        coordinates=["p", "title", "persName"],
        sleep_time=5,
        timeout=60,
        check_server=ping_server,
    )
    self.grobid_client = GrobidClient(**client_options)
142
 
143
  def process_structure(self, input_path, coordinates=False):
client.py → document_qa/ner_client_generic.py RENAMED
@@ -1,3 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
  """ Generic API Client """
2
  from copy import deepcopy
3
  import json
@@ -121,7 +133,7 @@ class ApiClient(object):
121
  params = deepcopy(params) or {}
122
  data = data or {}
123
  files = files or {}
124
- #if self.username is not None and self.api_key is not None:
125
  # params.update(self.get_credentials())
126
  r = requests.request(
127
  method,
@@ -223,3 +235,227 @@ class ApiClient(object):
223
  params={'format': 'json'},
224
  **kwargs
225
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+
4
+ import yaml
5
+
6
+ '''
7
+ This client is a generic client for any Grobid application and sub-modules.
8
+ At the moment, it supports only single document processing.
9
+
10
+ Source: https://github.com/kermitt2/grobid-client-python
11
+ '''
12
+
13
  """ Generic API Client """
14
  from copy import deepcopy
15
  import json
 
133
  params = deepcopy(params) or {}
134
  data = data or {}
135
  files = files or {}
136
+ # if self.username is not None and self.api_key is not None:
137
  # params.update(self.get_credentials())
138
  r = requests.request(
139
  method,
 
235
  params={'format': 'json'},
236
  **kwargs
237
  )
238
+
239
+
240
class NERClientGeneric(ApiClient):
    """Generic client for a Grobid application or sub-module, driven by a configuration mapping.

    The methods below read the following configuration entries:

    * ``config['grobid']['server']``: base URL of the service;
    * ``config['grobid']['url_mapping'][action]``: path appended to the base URL
      for a given action name (``"ping"`` is used by :meth:`ping_service`);
    * ``config['sleep_time']``: seconds to wait before retrying a saturated server;
    * ``config['max_retry']``: retry budget used by :meth:`process_pdf`.

    Every ``process_*`` method accepts a ``params`` argument for interface
    compatibility, but the form data actually sent is always the query string
    embedded in the configured URL (see :meth:`get_params_from_url`).
    """

    def __init__(self, config_path=None, ping=False):
        """Optionally load a YAML configuration and initialise the underlying API client.

        :param config_path: path of a YAML configuration file; when ``None`` the
            client is left unconfigured and :meth:`set_config` must be used later
        :param ping: when true (and a configuration was loaded), check that the
            server answers and raise if it does not
        :raises Exception: if ``ping`` is requested and the server is not reachable
        """
        self.config = None
        # NOTE(review): the parent initialiser and the ping only run when a
        # configuration path is supplied -- confirm against the original nesting.
        if config_path is not None:
            self.config = self._load_yaml_config_from_file(path=config_path)
            super().__init__(self.config['grobid']['server'])

            if ping and not self.ping_service():
                raise Exception("Grobid is down.")

        # Process-wide side effect: hosts matching nims.go.jp bypass any configured proxy.
        os.environ['NO_PROXY'] = "nims.go.jp"

    @staticmethod
    def _load_json_config_from_file(path='./config.json'):
        """Load and return the JSON configuration stored at ``path``."""
        with open(path, 'r') as fp:
            return json.load(fp)

    @staticmethod
    def _load_yaml_config_from_file(path='./config.yaml'):
        """Load and return the YAML configuration stored at ``path``.

        Any failure is reported on stdout and terminates the interpreter with
        exit status 1.
        """
        try:
            with open(path, 'r') as the_file:
                raw_configuration = the_file.read()

            return yaml.safe_load(raw_configuration)
        except Exception as e:
            print("Configuration could not be loaded: ", str(e))
            # Same effect as exit(1), without depending on the `site` module.
            raise SystemExit(1) from e

    def set_config(self, config, ping=False):
        """Replace the configuration mapping, optionally verifying the server is up.

        :param config: configuration mapping (see the class docstring for the keys read)
        :param ping: when true, ping the service with the new configuration
        :raises Exception: if the ping fails or raises
        """
        self.config = config
        if ping:
            try:
                result = self.ping_service()
                if not result:
                    raise Exception("Grobid is down.")
            except Exception as e:
                raise Exception("Grobid is down or other problems were encountered. ", e) from e

    def ping_service(self):
        """Return ``True`` when a GET on the configured ``ping`` URL answers 200."""
        ping_url = self.get_url("ping")

        status = requests.get(ping_url).status_code

        if status != 200:
            print('GROBID server does not appear up and running ' + str(status))
            return False

        print("GROBID server is up and running")
        return True

    def get_url(self, action):
        """Return the full URL for ``action``: base server URL plus the mapped path."""
        grobid_config = self.config['grobid']
        return grobid_config['server'] + grobid_config['url_mapping'][action]

    def _post_action(self, method_name, files, headers):
        """POST ``files`` to the URL configured for ``method_name``; return ``(response, status)``."""
        the_url = self.get_url(method_name)
        url_params, the_url = self.get_params_from_url(the_url)
        return self.post(url=the_url, files=files, data=url_params, headers=headers)

    def _post_until_available(self, method_name, files, headers):
        """Like :meth:`_post_action`, sleeping and retrying for as long as the server answers 503."""
        while True:
            res, status = self._post_action(method_name, files, headers)
            if status != 503:
                return res, status
            time.sleep(self.config['sleep_time'])

    def process_texts(self, input, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
        """Send ``input`` as the ``texts`` form field and return ``(status, parsed_json)``.

        ``parsed_json`` is ``None`` when the final status is not 200. A 503
        answer is retried after ``config['sleep_time']`` seconds.
        """
        res, status = self._post_until_available(method_name, {'texts': input}, headers)

        if status != 200:
            print('Processing failed with error ' + str(status))
            return status, None
        return status, json.loads(res.text)

    def process_text(self, input, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
        """Send ``input`` as the ``text`` form field and return ``(status, parsed_json)``.

        ``parsed_json`` is ``None`` when the final status is not 200. A 503
        answer is retried after ``config['sleep_time']`` seconds.
        """
        res, status = self._post_until_available(method_name, {'text': input}, headers)

        if status != 200:
            print('Processing failed with error ' + str(status))
            return status, None
        return status, json.loads(res.text)

    def process(self,
                form_data: dict,
                method_name='superconductors',
                params={},
                headers={"Accept": "application/json"}
                ):
        """Send an arbitrary multipart form and return the response text (``None`` on failure).

        NOTE(review): this method was declared as ``process_pdf`` and was
        immediately shadowed by the later ``process_pdf(pdf_file, ...)``
        definition, which made it unreachable. It is exposed again under its
        previous name ``process``. Its 503 retry used to call ``process_text``
        with the builtin ``input``; it now retries the same form.
        """
        res, status = self._post_until_available(method_name, form_data, headers)

        if status != 200:
            print('Processing failed with error ' + str(status))
            return None
        return res.text

    def process_pdfs(self, pdf_files, params={}):
        """Batch processing of several PDF files; not implemented."""
        pass

    def process_pdf(
            self,
            pdf_file,
            method_name,
            params={},
            headers={"Accept": "application/json"},
            verbose=False,
            retry=None
    ):
        """Upload one PDF file and return ``(body, status)``.

        :param pdf_file: path of the PDF file to upload as the ``input`` form field
        :param method_name: action name resolved through :meth:`get_url`
        :param params: ignored; the query string of the configured URL is sent instead
        :param headers: HTTP headers for the request
        :param verbose: print progress messages about retries
        :param retry: remaining retry budget; ``None`` on the first call
        :return: ``(response_text, 200)`` on success, ``(None, 204)`` when the
            server returns no content, ``(None, status)`` when retries are
            exhausted on 503/429, otherwise ``(description_or_None, status)``
        """
        # The file is only needed while the request is being sent; close it
        # before any sleep/retry so handles are not leaked.
        with open(pdf_file, 'rb') as pdf_stream:
            files = {
                'input': (
                    pdf_file,
                    pdf_stream,
                    'application/pdf',
                    {'Expires': '0'}
                )
            }
            res, status = self._post_action(method_name, files, headers)

        if status == 503 or status == 429:
            if retry is None:
                retry = self.config['max_retry'] - 1
            elif retry <= 1:
                # `<=` rather than `==` so a budget of 0 or less cannot loop forever.
                if verbose:
                    print("re-try exhausted. Aborting request")
                return None, status
            else:
                retry -= 1

            sleep_time = self.config['sleep_time']
            if verbose:
                print("Server is saturated, waiting", sleep_time, "seconds and trying again. ")
            time.sleep(sleep_time)
            return self.process_pdf(pdf_file, method_name, params, headers, verbose=verbose, retry=retry)

        if status == 204:
            # No content returned; checked before the generic error branch,
            # which would otherwise swallow it.
            return None, status

        if status != 200:
            desc = None
            if res.content:
                try:
                    payload = json.loads(res.text)
                except ValueError:
                    # Error bodies are not guaranteed to be JSON.
                    payload = None
                if isinstance(payload, dict):
                    desc = payload.get('description')
            return desc, status

        return res.text, status

    def get_params_from_url(self, the_url):
        """Split predefined parameters off a URL.

        Parameters may be stored in the configured URL in query-string form
        (``path?a=1&b=2``). They are returned as a dict together with the URL
        stripped of its query string. Values are taken verbatim (no
        percent-decoding); a parameter without ``=`` maps to an empty string
        and empty segments are skipped.

        :return: ``(params, url_without_query)``
        """
        params = {}
        if "?" in the_url:
            the_url, _, query = the_url.partition("?")
            for pair in query.split("&"):
                if not pair:
                    continue
                key, _, value = pair.partition("=")
                params[key] = value
        return params, the_url
grobid_client_generic.py DELETED
@@ -1,264 +0,0 @@
1
- import json
2
- import os
3
- import time
4
-
5
- import requests
6
- import yaml
7
-
8
- from client import ApiClient
9
-
10
- '''
11
- This client is a generic client for any Grobid application and sub-modules.
12
- At the moment, it supports only single document processing.
13
-
14
- Source: https://github.com/kermitt2/grobid-client-python
15
- '''
16
-
17
-
18
- class GrobidClientGeneric(ApiClient):
19
-
20
- def __init__(self, config_path=None, ping=False):
21
- self.config = None
22
- if config_path is not None:
23
- self.config = self.load_yaml_config_from_file(path=config_path)
24
- super().__init__(self.config['grobid']['server'])
25
-
26
- if ping:
27
- result = self.ping_grobid()
28
- if not result:
29
- raise Exception("Grobid is down.")
30
-
31
- os.environ['NO_PROXY'] = "nims.go.jp"
32
-
33
- @staticmethod
34
- def load_json_config_from_file(self, path='./config.json', ping=False):
35
- """
36
- Load the json configuration
37
- """
38
- config = {}
39
- with open(path, 'r') as fp:
40
- config = json.load(fp)
41
-
42
- if ping:
43
- result = self.ping_grobid()
44
- if not result:
45
- raise Exception("Grobid is down.")
46
-
47
- return config
48
-
49
- def load_yaml_config_from_file(self, path='./config.yaml'):
50
- """
51
- Load the YAML configuration
52
- """
53
- config = {}
54
- try:
55
- with open(path, 'r') as the_file:
56
- raw_configuration = the_file.read()
57
-
58
- config = yaml.safe_load(raw_configuration)
59
- except Exception as e:
60
- print("Configuration could not be loaded: ", str(e))
61
- exit(1)
62
-
63
- return config
64
-
65
- def set_config(self, config, ping=False):
66
- self.config = config
67
- if ping:
68
- try:
69
- result = self.ping_grobid()
70
- if not result:
71
- raise Exception("Grobid is down.")
72
- except Exception as e:
73
- raise Exception("Grobid is down or other problems were encountered. ", e)
74
-
75
- def ping_grobid(self):
76
- # test if the server is up and running...
77
- ping_url = self.get_grobid_url("ping")
78
-
79
- r = requests.get(ping_url)
80
- status = r.status_code
81
-
82
- if status != 200:
83
- print('GROBID server does not appear up and running ' + str(status))
84
- return False
85
- else:
86
- print("GROBID server is up and running")
87
- return True
88
-
89
- def get_grobid_url(self, action):
90
- grobid_config = self.config['grobid']
91
- base_url = grobid_config['server']
92
- action_url = base_url + grobid_config['url_mapping'][action]
93
-
94
- return action_url
95
-
96
- def process_texts(self, input, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
97
-
98
- files = {
99
- 'texts': input
100
- }
101
-
102
- the_url = self.get_grobid_url(method_name)
103
- params, the_url = self.get_params_from_url(the_url)
104
-
105
- res, status = self.post(
106
- url=the_url,
107
- files=files,
108
- data=params,
109
- headers=headers
110
- )
111
-
112
- if status == 503:
113
- time.sleep(self.config['sleep_time'])
114
- return self.process_texts(input, method_name, params, headers)
115
- elif status != 200:
116
- print('Processing failed with error ' + str(status))
117
- return status, None
118
- else:
119
- return status, json.loads(res.text)
120
-
121
- def process_text(self, input, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
122
-
123
- files = {
124
- 'text': input
125
- }
126
-
127
- the_url = self.get_grobid_url(method_name)
128
- params, the_url = self.get_params_from_url(the_url)
129
-
130
- res, status = self.post(
131
- url=the_url,
132
- files=files,
133
- data=params,
134
- headers=headers
135
- )
136
-
137
- if status == 503:
138
- time.sleep(self.config['sleep_time'])
139
- return self.process_text(input, method_name, params, headers)
140
- elif status != 200:
141
- print('Processing failed with error ' + str(status))
142
- return status, None
143
- else:
144
- return status, json.loads(res.text)
145
-
146
- def process(self, form_data: dict, method_name='superconductors', params={}, headers={"Accept": "application/json"}):
147
-
148
- the_url = self.get_grobid_url(method_name)
149
- params, the_url = self.get_params_from_url(the_url)
150
-
151
- res, status = self.post(
152
- url=the_url,
153
- files=form_data,
154
- data=params,
155
- headers=headers
156
- )
157
-
158
- if status == 503:
159
- time.sleep(self.config['sleep_time'])
160
- return self.process_text(input, method_name, params, headers)
161
- elif status != 200:
162
- print('Processing failed with error ' + str(status))
163
- else:
164
- return res.text
165
-
166
- def process_pdf_batch(self, pdf_files, params={}):
167
- pass
168
-
169
- def process_pdf(self, pdf_file, method_name, params={}, headers={"Accept": "application/json"}, verbose=False,
170
- retry=None):
171
-
172
- files = {
173
- 'input': (
174
- pdf_file,
175
- open(pdf_file, 'rb'),
176
- 'application/pdf',
177
- {'Expires': '0'}
178
- )
179
- }
180
-
181
- the_url = self.get_grobid_url(method_name)
182
-
183
- params, the_url = self.get_params_from_url(the_url)
184
-
185
- res, status = self.post(
186
- url=the_url,
187
- files=files,
188
- data=params,
189
- headers=headers
190
- )
191
-
192
- if status == 503 or status == 429:
193
- if retry is None:
194
- retry = self.config['max_retry'] - 1
195
- else:
196
- if retry - 1 == 0:
197
- if verbose:
198
- print("re-try exhausted. Aborting request")
199
- return None, status
200
- else:
201
- retry -= 1
202
-
203
- sleep_time = self.config['sleep_time']
204
- if verbose:
205
- print("Server is saturated, waiting", sleep_time, "seconds and trying again. ")
206
- time.sleep(sleep_time)
207
- return self.process_pdf(pdf_file, method_name, params, headers, verbose=verbose, retry=retry)
208
- elif status != 200:
209
- desc = None
210
- if res.content:
211
- c = json.loads(res.text)
212
- desc = c['description'] if 'description' in c else None
213
- return desc, status
214
- elif status == 204:
215
- # print('No content returned. Moving on. ')
216
- return None, status
217
- else:
218
- return res.text, status
219
-
220
- def get_params_from_url(self, the_url):
221
- params = {}
222
- if "?" in the_url:
223
- split = the_url.split("?")
224
- the_url = split[0]
225
- params = split[1]
226
-
227
- params = {param.split("=")[0]: param.split("=")[1] for param in params.split("&")}
228
- return params, the_url
229
-
230
- def process_json(self, text, method_name="processJson", params={}, headers={"Accept": "application/json"},
231
- verbose=False):
232
- files = {
233
- 'input': (
234
- None,
235
- text,
236
- 'application/json',
237
- {'Expires': '0'}
238
- )
239
- }
240
-
241
- the_url = self.get_grobid_url(method_name)
242
-
243
- params, the_url = self.get_params_from_url(the_url)
244
-
245
- res, status = self.post(
246
- url=the_url,
247
- files=files,
248
- data=params,
249
- headers=headers
250
- )
251
-
252
- if status == 503:
253
- time.sleep(self.config['sleep_time'])
254
- return self.process_json(text, method_name, params, headers), status
255
- elif status != 200:
256
- if verbose:
257
- print('Processing failed with error ', status)
258
- return None, status
259
- elif status == 204:
260
- if verbose:
261
- print('No content returned. Moving on. ')
262
- return None, status
263
- else:
264
- return res.text, status
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit_app.py CHANGED
@@ -441,7 +441,7 @@ with right_column:
441
  text_response = None
442
  if mode == "Embeddings":
443
  with st.spinner("Generating LLM response..."):
444
- text_response = st.session_state['rqa'][model].query_storage(question, st.session_state.doc_id,
445
  context_size=context_size)
446
  elif mode == "LLM":
447
  with st.spinner("Generating response..."):
 
441
  text_response = None
442
  if mode == "Embeddings":
443
  with st.spinner("Generating LLM response..."):
444
+ text_response = st.session_state['rqa'][model].query_storage_and_embeddings(question, st.session_state.doc_id,
445
  context_size=context_size)
446
  elif mode == "LLM":
447
  with st.spinner("Generating response..."):