Spaces:
Running
Running
Luca Foppiano
committed on
refactor grobid processors (#29)
Browse files
refactor grobid processors, deprecate legacy methods
document_qa/document_qa_engine.py
CHANGED
@@ -269,7 +269,7 @@ class DocumentQAEngine:
|
|
269 |
print("File", pdf_file_path)
|
270 |
filename = Path(pdf_file_path).stem
|
271 |
coordinates = True # if chunk_size == -1 else False
|
272 |
-
structure = self.grobid_processor.
|
273 |
|
274 |
biblio = structure['biblio']
|
275 |
biblio['filename'] = filename.replace(" ", "_")
|
|
|
269 |
print("File", pdf_file_path)
|
270 |
filename = Path(pdf_file_path).stem
|
271 |
coordinates = True # if chunk_size == -1 else False
|
272 |
+
structure = self.grobid_processor.process(pdf_file_path, coordinates=coordinates)
|
273 |
|
274 |
biblio = structure['biblio']
|
275 |
biblio['filename'] = filename.replace(" ", "_")
|
document_qa/grobid_processors.py
CHANGED
@@ -2,6 +2,7 @@ import re
|
|
2 |
from collections import OrderedDict
|
3 |
from html import escape
|
4 |
from pathlib import Path
|
|
|
5 |
|
6 |
import dateparser
|
7 |
import grobid_tei_xml
|
@@ -54,6 +55,7 @@ def decorate_text_with_annotations(text, spans, tag="span"):
|
|
54 |
return annotated_text
|
55 |
|
56 |
|
|
|
57 |
def extract_quantities(client, x_all, column_text_index):
|
58 |
# relevant_items = ['magnetic field strength', 'magnetic induction', 'maximum energy product',
|
59 |
# "magnetic flux density", "magnetic flux"]
|
@@ -63,7 +65,7 @@ def extract_quantities(client, x_all, column_text_index):
|
|
63 |
|
64 |
for idx, example in tqdm(enumerate(x_all), desc="extract quantities"):
|
65 |
text = example[column_text_index]
|
66 |
-
spans = GrobidQuantitiesProcessor(client).
|
67 |
|
68 |
data_record = {
|
69 |
"id": example[0],
|
@@ -78,12 +80,13 @@ def extract_quantities(client, x_all, column_text_index):
|
|
78 |
return output_data
|
79 |
|
80 |
|
|
|
81 |
def extract_materials(client, x_all, column_text_index):
|
82 |
output_data = []
|
83 |
|
84 |
for idx, example in tqdm(enumerate(x_all), desc="extract materials"):
|
85 |
text = example[column_text_index]
|
86 |
-
spans = GrobidMaterialsProcessor(client).
|
87 |
data_record = {
|
88 |
"id": example[0],
|
89 |
"filename": example[1],
|
@@ -131,7 +134,7 @@ class GrobidProcessor(BaseProcessor):
|
|
131 |
# super().__init__()
|
132 |
self.grobid_client = grobid_client
|
133 |
|
134 |
-
def
|
135 |
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
|
136 |
input_path,
|
137 |
consolidate_header=True,
|
@@ -145,19 +148,10 @@ class GrobidProcessor(BaseProcessor):
|
|
145 |
if status != 200:
|
146 |
return
|
147 |
|
148 |
-
|
149 |
-
|
150 |
|
151 |
-
return
|
152 |
-
|
153 |
-
def process_single(self, input_file):
|
154 |
-
doc = self.process_structure(input_file)
|
155 |
-
|
156 |
-
for paragraph in doc['passages']:
|
157 |
-
entities = self.process_single_text(paragraph['text'])
|
158 |
-
paragraph['spans'] = entities
|
159 |
-
|
160 |
-
return doc
|
161 |
|
162 |
def parse_grobid_xml(self, text, coordinates=False):
|
163 |
output_data = OrderedDict()
|
@@ -187,10 +181,10 @@ class GrobidProcessor(BaseProcessor):
|
|
187 |
"text": f"authors: {biblio['authors']}",
|
188 |
"type": passage_type,
|
189 |
"section": "<header>",
|
190 |
-
"subSection": "<
|
191 |
-
"passage_id": "
|
192 |
"coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
|
193 |
-
|
194 |
})
|
195 |
|
196 |
passages.append({
|
@@ -293,7 +287,7 @@ class GrobidQuantitiesProcessor(BaseProcessor):
|
|
293 |
def __init__(self, grobid_quantities_client):
|
294 |
self.grobid_quantities_client = grobid_quantities_client
|
295 |
|
296 |
-
def
|
297 |
status, result = self.grobid_quantities_client.process_text(text.strip())
|
298 |
|
299 |
if status != 200:
|
@@ -465,7 +459,7 @@ class GrobidMaterialsProcessor(BaseProcessor):
|
|
465 |
def __init__(self, grobid_superconductors_client):
|
466 |
self.grobid_superconductors_client = grobid_superconductors_client
|
467 |
|
468 |
-
def
|
469 |
preprocessed_text = text.strip()
|
470 |
status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
|
471 |
"processText_disable_linking")
|
@@ -568,17 +562,17 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
|
|
568 |
self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
|
569 |
|
570 |
def process_single_text(self, text):
|
571 |
-
extracted_quantities_spans = self.
|
572 |
-
extracted_materials_spans = self.
|
573 |
all_entities = extracted_quantities_spans + extracted_materials_spans
|
574 |
entities = self.prune_overlapping_annotations(all_entities)
|
575 |
return entities
|
576 |
|
577 |
-
def
|
578 |
-
return self.gqp.
|
579 |
|
580 |
-
def
|
581 |
-
return self.gmp.
|
582 |
|
583 |
@staticmethod
|
584 |
def box_to_dict(box, color=None, type=None):
|
@@ -715,8 +709,8 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
|
|
715 |
|
716 |
|
717 |
class XmlProcessor(BaseProcessor):
|
718 |
-
def __init__(self
|
719 |
-
super().__init__(
|
720 |
|
721 |
def process_structure(self, input_file):
|
722 |
text = ""
|
@@ -728,16 +722,16 @@ class XmlProcessor(BaseProcessor):
|
|
728 |
|
729 |
return output_data
|
730 |
|
731 |
-
def process_single(self, input_file):
|
732 |
-
|
733 |
-
|
734 |
-
|
735 |
-
|
736 |
-
|
737 |
-
|
738 |
-
|
739 |
|
740 |
-
def
|
741 |
output_data = OrderedDict()
|
742 |
soup = BeautifulSoup(text, 'xml')
|
743 |
text_blocks_children = get_children_list_supermat(soup, verbose=False)
|
|
|
2 |
from collections import OrderedDict
|
3 |
from html import escape
|
4 |
from pathlib import Path
|
5 |
+
from typing_extensions import deprecated
|
6 |
|
7 |
import dateparser
|
8 |
import grobid_tei_xml
|
|
|
55 |
return annotated_text
|
56 |
|
57 |
|
58 |
+
@deprecated("Use GrobidQuantitiesProcessor.process() instead")
|
59 |
def extract_quantities(client, x_all, column_text_index):
|
60 |
# relevant_items = ['magnetic field strength', 'magnetic induction', 'maximum energy product',
|
61 |
# "magnetic flux density", "magnetic flux"]
|
|
|
65 |
|
66 |
for idx, example in tqdm(enumerate(x_all), desc="extract quantities"):
|
67 |
text = example[column_text_index]
|
68 |
+
spans = GrobidQuantitiesProcessor(client).process(text)
|
69 |
|
70 |
data_record = {
|
71 |
"id": example[0],
|
|
|
80 |
return output_data
|
81 |
|
82 |
|
83 |
+
@deprecated("Use GrobidMaterialsProcessor.process() instead")
|
84 |
def extract_materials(client, x_all, column_text_index):
|
85 |
output_data = []
|
86 |
|
87 |
for idx, example in tqdm(enumerate(x_all), desc="extract materials"):
|
88 |
text = example[column_text_index]
|
89 |
+
spans = GrobidMaterialsProcessor(client).process(text)
|
90 |
data_record = {
|
91 |
"id": example[0],
|
92 |
"filename": example[1],
|
|
|
134 |
# super().__init__()
|
135 |
self.grobid_client = grobid_client
|
136 |
|
137 |
+
def process(self, input_path, coordinates=False):
|
138 |
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
|
139 |
input_path,
|
140 |
consolidate_header=True,
|
|
|
148 |
if status != 200:
|
149 |
return
|
150 |
|
151 |
+
document_object = self.parse_grobid_xml(text, coordinates=coordinates)
|
152 |
+
document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
|
153 |
|
154 |
+
return document_object
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
|
156 |
def parse_grobid_xml(self, text, coordinates=False):
|
157 |
output_data = OrderedDict()
|
|
|
181 |
"text": f"authors: {biblio['authors']}",
|
182 |
"type": passage_type,
|
183 |
"section": "<header>",
|
184 |
+
"subSection": "<authors>",
|
185 |
+
"passage_id": "hauthors",
|
186 |
"coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
|
187 |
+
blocks_header['authors']])
|
188 |
})
|
189 |
|
190 |
passages.append({
|
|
|
287 |
def __init__(self, grobid_quantities_client):
|
288 |
self.grobid_quantities_client = grobid_quantities_client
|
289 |
|
290 |
+
def process(self, text):
|
291 |
status, result = self.grobid_quantities_client.process_text(text.strip())
|
292 |
|
293 |
if status != 200:
|
|
|
459 |
def __init__(self, grobid_superconductors_client):
|
460 |
self.grobid_superconductors_client = grobid_superconductors_client
|
461 |
|
462 |
+
def process(self, text):
|
463 |
preprocessed_text = text.strip()
|
464 |
status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
|
465 |
"processText_disable_linking")
|
|
|
562 |
self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
|
563 |
|
564 |
def process_single_text(self, text):
|
565 |
+
extracted_quantities_spans = self.process_properties(text)
|
566 |
+
extracted_materials_spans = self.process_materials(text)
|
567 |
all_entities = extracted_quantities_spans + extracted_materials_spans
|
568 |
entities = self.prune_overlapping_annotations(all_entities)
|
569 |
return entities
|
570 |
|
571 |
+
def process_properties(self, text):
|
572 |
+
return self.gqp.process(text)
|
573 |
|
574 |
+
def process_materials(self, text):
|
575 |
+
return self.gmp.process(text)
|
576 |
|
577 |
@staticmethod
|
578 |
def box_to_dict(box, color=None, type=None):
|
|
|
709 |
|
710 |
|
711 |
class XmlProcessor(BaseProcessor):
|
712 |
+
def __init__(self):
|
713 |
+
super().__init__()
|
714 |
|
715 |
def process_structure(self, input_file):
|
716 |
text = ""
|
|
|
722 |
|
723 |
return output_data
|
724 |
|
725 |
+
# def process_single(self, input_file):
|
726 |
+
# doc = self.process_structure(input_file)
|
727 |
+
#
|
728 |
+
# for paragraph in doc['passages']:
|
729 |
+
# entities = self.process_single_text(paragraph['text'])
|
730 |
+
# paragraph['spans'] = entities
|
731 |
+
#
|
732 |
+
# return doc
|
733 |
|
734 |
+
def process(self, text):
|
735 |
output_data = OrderedDict()
|
736 |
soup = BeautifulSoup(text, 'xml')
|
737 |
text_blocks_children = get_children_list_supermat(soup, verbose=False)
|