lfoppiano committed on
Commit
53c8deb
1 Parent(s): ffa83ea

get data availability statement as context for QA

Browse files
Files changed (1) hide show
  1. document_qa/grobid_processors.py +15 -0
document_qa/grobid_processors.py CHANGED
@@ -183,6 +183,7 @@ class GrobidProcessor(BaseProcessor):
183
  })
184
 
185
  text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)
 
186
 
187
  use_paragraphs = True
188
  if not use_paragraphs:
@@ -800,6 +801,20 @@ def get_xml_nodes_body(soup: object, use_paragraphs: bool = True, verbose: bool
800
  return nodes
801
 
802
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
  def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
804
  children = []
805
  for child in soup.TEI.children:
 
183
  })
184
 
185
  text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)
186
+ text_blocks_body.extend(get_xml_nodes_back(soup, verbose=False, use_paragraphs=True))
187
 
188
  use_paragraphs = True
189
  if not use_paragraphs:
 
801
  return nodes
802
 
803
 
804
def get_xml_nodes_back(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
    """Collect the paragraph or sentence nodes located under ``back`` elements.

    Walks the direct children of ``soup.TEI``; for every child named
    ``text`` it looks up all ``back`` elements via ``find_all`` and gathers
    the ``p`` (or ``s``) nodes each of them reports.

    NOTE(review): ``soup`` is presumably a BeautifulSoup tree of a GROBID
    TEI document -- confirm against the caller.

    :param soup: parsed document exposing ``TEI.children``
    :param use_paragraphs: when true collect ``p`` nodes, otherwise ``s`` nodes
    :param verbose: when true, print the collected nodes before returning
    :return: list of the matching nodes, in the order they were found
    """
    wanted_tag = "p" if use_paragraphs else "s"

    collected = []
    for section in soup.TEI.children:
        # Only the <text> child is inspected; any other child is skipped.
        if section.name != 'text':
            continue
        for back_element in section.find_all("back"):
            collected.extend(back_element.find_all(wanted_tag))

    if verbose:
        print(str(collected))

    return collected
816
+
817
+
818
  def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
819
  children = []
820
  for child in soup.TEI.children: