lfoppiano committed on
Commit
f684be7
1 Parent(s): eedfb73

decouple quantities and superconductors

Browse files
Files changed (1) hide show
  1. document_qa/grobid_processors.py +17 -53
document_qa/grobid_processors.py CHANGED
@@ -7,7 +7,6 @@ import dateparser
7
  import grobid_tei_xml
8
  from bs4 import BeautifulSoup
9
  from grobid_client.grobid_client import GrobidClient
10
- from tqdm import tqdm
11
 
12
 
13
  def get_span_start(type, title=None):
@@ -55,49 +54,6 @@ def decorate_text_with_annotations(text, spans, tag="span"):
55
  return annotated_text
56
 
57
 
58
- def extract_quantities(client, x_all, column_text_index):
59
- # relevant_items = ['magnetic field strength', 'magnetic induction', 'maximum energy product',
60
- # "magnetic flux density", "magnetic flux"]
61
- # property_keywords = ['coercivity', 'remanence']
62
-
63
- output_data = []
64
-
65
- for idx, example in tqdm(enumerate(x_all), desc="extract quantities"):
66
- text = example[column_text_index]
67
- spans = GrobidQuantitiesProcessor(client).extract_quantities(text)
68
-
69
- data_record = {
70
- "id": example[0],
71
- "filename": example[1],
72
- "passage_id": example[2],
73
- "text": text,
74
- "spans": spans
75
- }
76
-
77
- output_data.append(data_record)
78
-
79
- return output_data
80
-
81
-
82
- def extract_materials(client, x_all, column_text_index):
83
- output_data = []
84
-
85
- for idx, example in tqdm(enumerate(x_all), desc="extract materials"):
86
- text = example[column_text_index]
87
- spans = GrobidMaterialsProcessor(client).extract_materials(text)
88
- data_record = {
89
- "id": example[0],
90
- "filename": example[1],
91
- "passage_id": example[2],
92
- "text": text,
93
- "spans": spans
94
- }
95
-
96
- output_data.append(data_record)
97
-
98
- return output_data
99
-
100
-
101
  def get_parsed_value_type(quantity):
102
  if 'parsedValue' in quantity and 'structure' in quantity['parsedValue']:
103
  return quantity['parsedValue']['structure']['type']
@@ -199,7 +155,7 @@ class GrobidProcessor(BaseProcessor):
199
  "subSection": "<title>",
200
  "passage_id": "htitle",
201
  "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
202
- blocks_header['authors']])
203
  })
204
 
205
  passages.append({
@@ -302,7 +258,7 @@ class GrobidQuantitiesProcessor(BaseProcessor):
302
  def __init__(self, grobid_quantities_client):
303
  self.grobid_quantities_client = grobid_quantities_client
304
 
305
- def extract_quantities(self, text):
306
  status, result = self.grobid_quantities_client.process_text(text.strip())
307
 
308
  if status != 200:
@@ -570,11 +526,12 @@ class GrobidMaterialsProcessor(BaseProcessor):
570
  return materials
571
 
572
 
573
- class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, GrobidMaterialsProcessor):
574
- def __init__(self, grobid_client, grobid_quantities_client=None, grobid_superconductors_client=None):
575
- GrobidProcessor.__init__(self, grobid_client)
576
- self.gqp = GrobidQuantitiesProcessor(grobid_quantities_client)
577
- self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
 
578
 
579
  def process_single_text(self, text):
580
  extracted_quantities_spans = self.gqp.extract_quantities(text)
@@ -584,10 +541,17 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
584
  return entities
585
 
586
  def extract_quantities(self, text):
587
- return self.gqp.extract_quantities(text)
 
 
 
 
588
 
589
  def extract_materials(self, text):
590
- return self.gmp.extract_materials(text)
 
 
 
591
 
592
  @staticmethod
593
  def box_to_dict(box, color=None, type=None):
 
7
  import grobid_tei_xml
8
  from bs4 import BeautifulSoup
9
  from grobid_client.grobid_client import GrobidClient
 
10
 
11
 
12
  def get_span_start(type, title=None):
 
54
  return annotated_text
55
 
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def get_parsed_value_type(quantity):
58
  if 'parsedValue' in quantity and 'structure' in quantity['parsedValue']:
59
  return quantity['parsedValue']['structure']['type']
 
155
  "subSection": "<title>",
156
  "passage_id": "htitle",
157
  "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
158
+ blocks_header['authors']])
159
  })
160
 
161
  passages.append({
 
258
  def __init__(self, grobid_quantities_client):
259
  self.grobid_quantities_client = grobid_quantities_client
260
 
261
+ def extract_quantities(self, text) -> list:
262
  status, result = self.grobid_quantities_client.process_text(text.strip())
263
 
264
  if status != 200:
 
526
  return materials
527
 
528
 
529
+ class GrobidAggregationProcessor(GrobidQuantitiesProcessor, GrobidMaterialsProcessor):
530
+ def __init__(self, grobid_quantities_client=None, grobid_superconductors_client=None):
531
+ if grobid_quantities_client:
532
+ self.gqp = GrobidQuantitiesProcessor(grobid_quantities_client)
533
+ if grobid_superconductors_client:
534
+ self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
535
 
536
  def process_single_text(self, text):
537
  extracted_quantities_spans = self.gqp.extract_quantities(text)
 
541
  return entities
542
 
543
  def extract_quantities(self, text):
544
+ if self.gqp:
545
+ return self.gqp.extract_quantities(text)
546
+ else:
547
+ return []
548
+
549
 
550
  def extract_materials(self, text):
551
+ if self.gmp:
552
+ return self.gmp.extract_materials(text)
553
+ else:
554
+ return []
555
 
556
  @staticmethod
557
  def box_to_dict(box, color=None, type=None):