Luca Foppiano committed on
Commit
104b3a9
·
unverified ·
1 Parent(s): c08e73a

refactor grobid processors (#29)

Browse files

refactor grobid processors, deprecate legacy methods

document_qa/document_qa_engine.py CHANGED
@@ -269,7 +269,7 @@ class DocumentQAEngine:
269
  print("File", pdf_file_path)
270
  filename = Path(pdf_file_path).stem
271
  coordinates = True # if chunk_size == -1 else False
272
- structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
273
 
274
  biblio = structure['biblio']
275
  biblio['filename'] = filename.replace(" ", "_")
 
269
  print("File", pdf_file_path)
270
  filename = Path(pdf_file_path).stem
271
  coordinates = True # if chunk_size == -1 else False
272
+ structure = self.grobid_processor.process(pdf_file_path, coordinates=coordinates)
273
 
274
  biblio = structure['biblio']
275
  biblio['filename'] = filename.replace(" ", "_")
document_qa/grobid_processors.py CHANGED
@@ -2,6 +2,7 @@ import re
2
  from collections import OrderedDict
3
  from html import escape
4
  from pathlib import Path
 
5
 
6
  import dateparser
7
  import grobid_tei_xml
@@ -54,6 +55,7 @@ def decorate_text_with_annotations(text, spans, tag="span"):
54
  return annotated_text
55
 
56
 
 
57
  def extract_quantities(client, x_all, column_text_index):
58
  # relevant_items = ['magnetic field strength', 'magnetic induction', 'maximum energy product',
59
  # "magnetic flux density", "magnetic flux"]
@@ -63,7 +65,7 @@ def extract_quantities(client, x_all, column_text_index):
63
 
64
  for idx, example in tqdm(enumerate(x_all), desc="extract quantities"):
65
  text = example[column_text_index]
66
- spans = GrobidQuantitiesProcessor(client).extract_quantities(text)
67
 
68
  data_record = {
69
  "id": example[0],
@@ -78,12 +80,13 @@ def extract_quantities(client, x_all, column_text_index):
78
  return output_data
79
 
80
 
 
81
  def extract_materials(client, x_all, column_text_index):
82
  output_data = []
83
 
84
  for idx, example in tqdm(enumerate(x_all), desc="extract materials"):
85
  text = example[column_text_index]
86
- spans = GrobidMaterialsProcessor(client).extract_materials(text)
87
  data_record = {
88
  "id": example[0],
89
  "filename": example[1],
@@ -131,7 +134,7 @@ class GrobidProcessor(BaseProcessor):
131
  # super().__init__()
132
  self.grobid_client = grobid_client
133
 
134
- def process_structure(self, input_path, coordinates=False):
135
  pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
136
  input_path,
137
  consolidate_header=True,
@@ -145,19 +148,10 @@ class GrobidProcessor(BaseProcessor):
145
  if status != 200:
146
  return
147
 
148
- output_data = self.parse_grobid_xml(text, coordinates=coordinates)
149
- output_data['filename'] = Path(pdf_file).stem.replace(".tei", "")
150
 
151
- return output_data
152
-
153
- def process_single(self, input_file):
154
- doc = self.process_structure(input_file)
155
-
156
- for paragraph in doc['passages']:
157
- entities = self.process_single_text(paragraph['text'])
158
- paragraph['spans'] = entities
159
-
160
- return doc
161
 
162
  def parse_grobid_xml(self, text, coordinates=False):
163
  output_data = OrderedDict()
@@ -187,10 +181,10 @@ class GrobidProcessor(BaseProcessor):
187
  "text": f"authors: {biblio['authors']}",
188
  "type": passage_type,
189
  "section": "<header>",
190
- "subSection": "<title>",
191
- "passage_id": "htitle",
192
  "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
193
- blocks_header['authors']])
194
  })
195
 
196
  passages.append({
@@ -293,7 +287,7 @@ class GrobidQuantitiesProcessor(BaseProcessor):
293
  def __init__(self, grobid_quantities_client):
294
  self.grobid_quantities_client = grobid_quantities_client
295
 
296
- def extract_quantities(self, text):
297
  status, result = self.grobid_quantities_client.process_text(text.strip())
298
 
299
  if status != 200:
@@ -465,7 +459,7 @@ class GrobidMaterialsProcessor(BaseProcessor):
465
  def __init__(self, grobid_superconductors_client):
466
  self.grobid_superconductors_client = grobid_superconductors_client
467
 
468
- def extract_materials(self, text):
469
  preprocessed_text = text.strip()
470
  status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
471
  "processText_disable_linking")
@@ -568,17 +562,17 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
568
  self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
569
 
570
  def process_single_text(self, text):
571
- extracted_quantities_spans = self.gqp.extract_quantities(text)
572
- extracted_materials_spans = self.gmp.extract_materials(text)
573
  all_entities = extracted_quantities_spans + extracted_materials_spans
574
  entities = self.prune_overlapping_annotations(all_entities)
575
  return entities
576
 
577
- def extract_quantities(self, text):
578
- return self.gqp.extract_quantities(text)
579
 
580
- def extract_materials(self, text):
581
- return self.gmp.extract_materials(text)
582
 
583
  @staticmethod
584
  def box_to_dict(box, color=None, type=None):
@@ -715,8 +709,8 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
715
 
716
 
717
  class XmlProcessor(BaseProcessor):
718
- def __init__(self, grobid_superconductors_client, grobid_quantities_client):
719
- super().__init__(grobid_superconductors_client, grobid_quantities_client)
720
 
721
  def process_structure(self, input_file):
722
  text = ""
@@ -728,16 +722,16 @@ class XmlProcessor(BaseProcessor):
728
 
729
  return output_data
730
 
731
- def process_single(self, input_file):
732
- doc = self.process_structure(input_file)
733
-
734
- for paragraph in doc['passages']:
735
- entities = self.process_single_text(paragraph['text'])
736
- paragraph['spans'] = entities
737
-
738
- return doc
739
 
740
- def parse_xml(self, text):
741
  output_data = OrderedDict()
742
  soup = BeautifulSoup(text, 'xml')
743
  text_blocks_children = get_children_list_supermat(soup, verbose=False)
 
2
  from collections import OrderedDict
3
  from html import escape
4
  from pathlib import Path
5
+ from typing_extensions import deprecated
6
 
7
  import dateparser
8
  import grobid_tei_xml
 
55
  return annotated_text
56
 
57
 
58
+ @deprecated("Use GrobidQuantitiesProcessor.process() instead")
59
  def extract_quantities(client, x_all, column_text_index):
60
  # relevant_items = ['magnetic field strength', 'magnetic induction', 'maximum energy product',
61
  # "magnetic flux density", "magnetic flux"]
 
65
 
66
  for idx, example in tqdm(enumerate(x_all), desc="extract quantities"):
67
  text = example[column_text_index]
68
+ spans = GrobidQuantitiesProcessor(client).process(text)
69
 
70
  data_record = {
71
  "id": example[0],
 
80
  return output_data
81
 
82
 
83
+ @deprecated("Use GrobidMaterialsProcessor.process() instead")
84
  def extract_materials(client, x_all, column_text_index):
85
  output_data = []
86
 
87
  for idx, example in tqdm(enumerate(x_all), desc="extract materials"):
88
  text = example[column_text_index]
89
+ spans = GrobidMaterialsProcessor(client).process(text)
90
  data_record = {
91
  "id": example[0],
92
  "filename": example[1],
 
134
  # super().__init__()
135
  self.grobid_client = grobid_client
136
 
137
+ def process(self, input_path, coordinates=False):
138
  pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
139
  input_path,
140
  consolidate_header=True,
 
148
  if status != 200:
149
  return
150
 
151
+ document_object = self.parse_grobid_xml(text, coordinates=coordinates)
152
+ document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
153
 
154
+ return document_object
 
 
 
 
 
 
 
 
 
155
 
156
  def parse_grobid_xml(self, text, coordinates=False):
157
  output_data = OrderedDict()
 
181
  "text": f"authors: {biblio['authors']}",
182
  "type": passage_type,
183
  "section": "<header>",
184
+ "subSection": "<authors>",
185
+ "passage_id": "hauthors",
186
  "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
187
+ blocks_header['authors']])
188
  })
189
 
190
  passages.append({
 
287
  def __init__(self, grobid_quantities_client):
288
  self.grobid_quantities_client = grobid_quantities_client
289
 
290
+ def process(self, text):
291
  status, result = self.grobid_quantities_client.process_text(text.strip())
292
 
293
  if status != 200:
 
459
  def __init__(self, grobid_superconductors_client):
460
  self.grobid_superconductors_client = grobid_superconductors_client
461
 
462
+ def process(self, text):
463
  preprocessed_text = text.strip()
464
  status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
465
  "processText_disable_linking")
 
562
  self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
563
 
564
  def process_single_text(self, text):
565
+ extracted_quantities_spans = self.process_properties(text)
566
+ extracted_materials_spans = self.process_materials(text)
567
  all_entities = extracted_quantities_spans + extracted_materials_spans
568
  entities = self.prune_overlapping_annotations(all_entities)
569
  return entities
570
 
571
+ def process_properties(self, text):
572
+ return self.gqp.process(text)
573
 
574
+ def process_materials(self, text):
575
+ return self.gmp.process(text)
576
 
577
  @staticmethod
578
  def box_to_dict(box, color=None, type=None):
 
709
 
710
 
711
  class XmlProcessor(BaseProcessor):
712
+ def __init__(self):
713
+ super().__init__()
714
 
715
  def process_structure(self, input_file):
716
  text = ""
 
722
 
723
  return output_data
724
 
725
+ # def process_single(self, input_file):
726
+ # doc = self.process_structure(input_file)
727
+ #
728
+ # for paragraph in doc['passages']:
729
+ # entities = self.process_single_text(paragraph['text'])
730
+ # paragraph['spans'] = entities
731
+ #
732
+ # return doc
733
 
734
+ def process(self, text):
735
  output_data = OrderedDict()
736
  soup = BeautifulSoup(text, 'xml')
737
  text_blocks_children = get_children_list_supermat(soup, verbose=False)