lfoppiano committed on
Commit
88c017f
2 Parent(s): 9c16287 168d47b

Merge branch 'main' into question-coefficient

Browse files
document_qa/document_qa_engine.py CHANGED
@@ -494,7 +494,7 @@ class DocumentQAEngine:
494
  print("File", pdf_file_path)
495
  filename = Path(pdf_file_path).stem
496
  coordinates = True # if chunk_size == -1 else False
497
- structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
498
 
499
  biblio = structure['biblio']
500
  biblio['filename'] = filename.replace(" ", "_")
 
494
  print("File", pdf_file_path)
495
  filename = Path(pdf_file_path).stem
496
  coordinates = True # if chunk_size == -1 else False
497
+ structure = self.grobid_processor.process(pdf_file_path, coordinates=coordinates)
498
 
499
  biblio = structure['biblio']
500
  biblio['filename'] = filename.replace(" ", "_")
document_qa/grobid_processors.py CHANGED
@@ -110,10 +110,10 @@ class GrobidProcessor(BaseProcessor):
110
  if status != 200:
111
  return
112
 
113
- output_data = self.parse_grobid_xml(text, coordinates=coordinates)
114
- output_data['filename'] = Path(pdf_file).stem.replace(".tei", "")
115
 
116
- return output_data
117
 
118
  def process_single(self, input_file):
119
  doc = self.process_structure(input_file)
@@ -152,8 +152,8 @@ class GrobidProcessor(BaseProcessor):
152
  "text": f"authors: {biblio['authors']}",
153
  "type": passage_type,
154
  "section": "<header>",
155
- "subSection": "<title>",
156
- "passage_id": "htitle",
157
  "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
158
  blocks_header['authors']])
159
  })
@@ -258,7 +258,7 @@ class GrobidQuantitiesProcessor(BaseProcessor):
258
  def __init__(self, grobid_quantities_client):
259
  self.grobid_quantities_client = grobid_quantities_client
260
 
261
- def extract_quantities(self, text) -> list:
262
  status, result = self.grobid_quantities_client.process_text(text.strip())
263
 
264
  if status != 200:
@@ -430,7 +430,7 @@ class GrobidMaterialsProcessor(BaseProcessor):
430
  def __init__(self, grobid_superconductors_client):
431
  self.grobid_superconductors_client = grobid_superconductors_client
432
 
433
- def extract_materials(self, text):
434
  preprocessed_text = text.strip()
435
  status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
436
  "processText_disable_linking")
@@ -534,22 +534,21 @@ class GrobidAggregationProcessor(GrobidQuantitiesProcessor, GrobidMaterialsProce
534
  self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
535
 
536
  def process_single_text(self, text):
537
- extracted_quantities_spans = self.gqp.extract_quantities(text)
538
- extracted_materials_spans = self.gmp.extract_materials(text)
539
  all_entities = extracted_quantities_spans + extracted_materials_spans
540
  entities = self.prune_overlapping_annotations(all_entities)
541
  return entities
542
 
543
- def extract_quantities(self, text):
544
  if self.gqp:
545
- return self.gqp.extract_quantities(text)
546
  else:
547
  return []
548
 
549
-
550
- def extract_materials(self, text):
551
  if self.gmp:
552
- return self.gmp.extract_materials(text)
553
  else:
554
  return []
555
 
@@ -688,8 +687,8 @@ class GrobidAggregationProcessor(GrobidQuantitiesProcessor, GrobidMaterialsProce
688
 
689
 
690
  class XmlProcessor(BaseProcessor):
691
- def __init__(self, grobid_superconductors_client, grobid_quantities_client):
692
- super().__init__(grobid_superconductors_client, grobid_quantities_client)
693
 
694
  def process_structure(self, input_file):
695
  text = ""
@@ -701,16 +700,16 @@ class XmlProcessor(BaseProcessor):
701
 
702
  return output_data
703
 
704
- def process_single(self, input_file):
705
- doc = self.process_structure(input_file)
706
-
707
- for paragraph in doc['passages']:
708
- entities = self.process_single_text(paragraph['text'])
709
- paragraph['spans'] = entities
710
-
711
- return doc
712
 
713
- def parse_xml(self, text):
714
  output_data = OrderedDict()
715
  soup = BeautifulSoup(text, 'xml')
716
  text_blocks_children = get_children_list_supermat(soup, verbose=False)
 
110
  if status != 200:
111
  return
112
 
113
+ document_object = self.parse_grobid_xml(text, coordinates=coordinates)
114
+ document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
115
 
116
+ return document_object
117
 
118
  def process_single(self, input_file):
119
  doc = self.process_structure(input_file)
 
152
  "text": f"authors: {biblio['authors']}",
153
  "type": passage_type,
154
  "section": "<header>",
155
+ "subSection": "<authors>",
156
+ "passage_id": "hauthors",
157
  "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
158
  blocks_header['authors']])
159
  })
 
258
  def __init__(self, grobid_quantities_client):
259
  self.grobid_quantities_client = grobid_quantities_client
260
 
261
+ def process(self, text) -> list:
262
  status, result = self.grobid_quantities_client.process_text(text.strip())
263
 
264
  if status != 200:
 
430
  def __init__(self, grobid_superconductors_client):
431
  self.grobid_superconductors_client = grobid_superconductors_client
432
 
433
+ def process(self, text):
434
  preprocessed_text = text.strip()
435
  status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
436
  "processText_disable_linking")
 
534
  self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
535
 
536
  def process_single_text(self, text):
537
+ extracted_quantities_spans = self.process_properties(text)
538
+ extracted_materials_spans = self.process_materials(text)
539
  all_entities = extracted_quantities_spans + extracted_materials_spans
540
  entities = self.prune_overlapping_annotations(all_entities)
541
  return entities
542
 
543
+ def process_properties(self, text):
544
  if self.gqp:
545
+ return self.gqp.process(text)
546
  else:
547
  return []
548
 
549
+ def process_materials(self, text):
 
550
  if self.gmp:
551
+ return self.gmp.process(text)
552
  else:
553
  return []
554
 
 
687
 
688
 
689
  class XmlProcessor(BaseProcessor):
690
+ def __init__(self):
691
+ super().__init__()
692
 
693
  def process_structure(self, input_file):
694
  text = ""
 
700
 
701
  return output_data
702
 
703
+ # def process_single(self, input_file):
704
+ # doc = self.process_structure(input_file)
705
+ #
706
+ # for paragraph in doc['passages']:
707
+ # entities = self.process_single_text(paragraph['text'])
708
+ # paragraph['spans'] = entities
709
+ #
710
+ # return doc
711
 
712
+ def process(self, text):
713
  output_data = OrderedDict()
714
  soup = BeautifulSoup(text, 'xml')
715
  text_blocks_children = get_children_list_supermat(soup, verbose=False)