lfoppiano committed on
Commit
55de44e
·
1 Parent(s): b325c61

include only bibliographic data that potentially have coordinates

Browse files
document_qa/document_qa_engine.py CHANGED
@@ -57,7 +57,7 @@ class DocumentQAEngine:
57
  grobid_client = GrobidClient(
58
  grobid_server=self.grobid_url,
59
  batch_size=1000,
60
- coordinates=["p"],
61
  sleep_time=5,
62
  timeout=60,
63
  check_server=True
@@ -189,7 +189,7 @@ class DocumentQAEngine:
189
  relevant_documents = multi_query_retriever.get_relevant_documents(query)
190
  return relevant_documents
191
 
192
- def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, include=(), verbose=False):
193
  """
194
  Extract text from documents using Grobid, if chunk_size is < 0 it keeps each paragraph separately
195
  """
@@ -233,25 +233,13 @@ class DocumentQAEngine:
233
  metadatas = [biblio for _ in range(len(texts))]
234
  ids = [id for id, t in enumerate(texts)]
235
 
236
- if "biblio" in include:
237
- biblio_metadata = copy.copy(biblio)
238
- biblio_metadata['type'] = "biblio"
239
- biblio_metadata['section'] = "header"
240
- for key in ['title', 'authors', 'publication_year']:
241
- if key in biblio_metadata:
242
- texts.append("{}: {}".format(key, biblio_metadata[key]))
243
- metadatas.append(biblio_metadata)
244
- ids.append(key)
245
-
246
  return texts, metadatas, ids
247
 
248
- def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1, include_biblio=False):
249
- include = ["biblio"] if include_biblio else []
250
  texts, metadata, ids = self.get_text_from_document(
251
  pdf_path,
252
  chunk_size=chunk_size,
253
- perc_overlap=perc_overlap,
254
- include=include)
255
  if doc_id:
256
  hash = doc_id
257
  else:
 
57
  grobid_client = GrobidClient(
58
  grobid_server=self.grobid_url,
59
  batch_size=1000,
60
+ coordinates=["p", "title", "persName"],
61
  sleep_time=5,
62
  timeout=60,
63
  check_server=True
 
189
  relevant_documents = multi_query_retriever.get_relevant_documents(query)
190
  return relevant_documents
191
 
192
+ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
193
  """
194
  Extract text from documents using Grobid, if chunk_size is < 0 it keeps each paragraph separately
195
  """
 
233
  metadatas = [biblio for _ in range(len(texts))]
234
  ids = [id for id, t in enumerate(texts)]
235
 
 
 
 
 
 
 
 
 
 
 
236
  return texts, metadatas, ids
237
 
238
+ def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1):
 
239
  texts, metadata, ids = self.get_text_from_document(
240
  pdf_path,
241
  chunk_size=chunk_size,
242
+ perc_overlap=perc_overlap)
 
243
  if doc_id:
244
  hash = doc_id
245
  else:
document_qa/grobid_processors.py CHANGED
@@ -176,32 +176,48 @@ class GrobidProcessor(BaseProcessor):
176
  pass
177
 
178
  output_data['biblio'] = biblio
179
-
180
  passages = []
181
  output_data['passages'] = passages
182
- # if biblio['title'] is not None and len(biblio['title']) > 0:
183
- # passages.append({
184
- # "text": self.post_process(biblio['title']),
185
- # "type": "paragraph",
186
- # "section": "<header>",
187
- # "subSection": "<title>",
188
- # "passage_id": "title0"
189
- # })
190
-
191
  passage_type = "paragraph"
192
 
193
- if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
194
- passages.append({
195
- "text": self.post_process(doc_biblio.abstract),
196
- "type": passage_type,
197
- "section": "<header>",
198
- "subSection": "<abstract>",
199
- "passage_id": "abstract0",
200
- "coordinates": ""
201
- })
202
-
203
  soup = BeautifulSoup(text, 'xml')
204
- text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  use_paragraphs = True
207
  if not use_paragraphs:
@@ -236,7 +252,7 @@ class GrobidProcessor(BaseProcessor):
236
  for paragraph_id, paragraph in enumerate(text_blocks_body)
237
  ])
238
 
239
- text_blocks_figures = get_children_figures(soup, verbose=False)
240
 
241
  if not use_paragraphs:
242
  passages.extend([
@@ -784,23 +800,36 @@ def get_children_list_grobid(soup: object, use_paragraphs: object = True, verbos
784
  return children
785
 
786
 
787
- def get_children_body(soup: object, use_paragraphs: object = True, verbose: object = False) -> object:
788
- children = []
789
- child_name = "p" if use_paragraphs else "s"
 
 
 
 
 
 
 
 
 
 
 
 
 
790
  for child in soup.TEI.children:
791
  if child.name == 'text':
792
- children.extend(
793
- [subchild for subchild in child.find_all("body") for subchild in subchild.find_all(child_name)])
 
794
 
795
  if verbose:
796
- print(str(children))
797
 
798
- return children
799
 
800
 
801
- def get_children_figures(soup: object, use_paragraphs: object = True, verbose: object = False) -> object:
802
  children = []
803
- child_name = "p" if use_paragraphs else "s"
804
  for child in soup.TEI.children:
805
  if child.name == 'text':
806
  children.extend(
 
176
  pass
177
 
178
  output_data['biblio'] = biblio
 
179
  passages = []
180
  output_data['passages'] = passages
 
 
 
 
 
 
 
 
 
181
  passage_type = "paragraph"
182
 
 
 
 
 
 
 
 
 
 
 
183
  soup = BeautifulSoup(text, 'xml')
184
+ blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)
185
+
186
+ passages.append({
187
+ "text": f"authors: {biblio['authors']}",
188
+ "type": passage_type,
189
+ "section": "<header>",
190
+ "subSection": "<title>",
191
+ "passage_id": "htitle",
192
+ "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
193
+ blocks_header['authors']])
194
+ })
195
+
196
+ passages.append({
197
+ "text": self.post_process(" ".join([node.text for node in blocks_header['title']])),
198
+ "type": passage_type,
199
+ "section": "<header>",
200
+ "subSection": "<title>",
201
+ "passage_id": "htitle",
202
+ "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
203
+ blocks_header['title']])
204
+ })
205
+
206
+ passages.append({
207
+ "text": self.post_process(
208
+ ''.join(node.text for node in blocks_header['abstract'] for text in node.find_all(text=True) if
209
+ text.parent.name != "ref" or (
210
+ text.parent.name == "ref" and text.parent.attrs[
211
+ 'type'] != 'bibr'))),
212
+ "type": passage_type,
213
+ "section": "<header>",
214
+ "subSection": "<abstract>",
215
+ "passage_id": "habstract",
216
+ "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
217
+ blocks_header['abstract']])
218
+ })
219
+
220
+ text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)
221
 
222
  use_paragraphs = True
223
  if not use_paragraphs:
 
252
  for paragraph_id, paragraph in enumerate(text_blocks_body)
253
  ])
254
 
255
+ text_blocks_figures = get_xml_nodes_figures(soup, verbose=False)
256
 
257
  if not use_paragraphs:
258
  passages.extend([
 
800
  return children
801
 
802
 
803
+ def get_xml_nodes_header(soup: object, use_paragraphs: bool = True) -> list:
804
+ sub_tag = "p" if use_paragraphs else "s"
805
+
806
+ header_elements = {
807
+ "authors": [persNameNode for persNameNode in soup.teiHeader.find_all("persName")],
808
+ "abstract": [p_in_abstract for abstractNodes in soup.teiHeader.find_all("abstract") for p_in_abstract in
809
+ abstractNodes.find_all(sub_tag)],
810
+ "title": [soup.teiHeader.fileDesc.title]
811
+ }
812
+
813
+ return header_elements
814
+
815
+
816
+ def get_xml_nodes_body(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
817
+ nodes = []
818
+ tag_name = "p" if use_paragraphs else "s"
819
  for child in soup.TEI.children:
820
  if child.name == 'text':
821
+ # nodes.extend([subchild.find_all(tag_name) for subchild in child.find_all("body")])
822
+ nodes.extend(
823
+ [subsubchild for subchild in child.find_all("body") for subsubchild in subchild.find_all(tag_name)])
824
 
825
  if verbose:
826
+ print(str(nodes))
827
 
828
+ return nodes
829
 
830
 
831
+ def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
832
  children = []
 
833
  for child in soup.TEI.children:
834
  if child.name == 'text':
835
  children.extend(
tests/test_grobid_processors.py CHANGED
@@ -1,20 +1,46 @@
1
  from bs4 import BeautifulSoup
2
- from document_qa.grobid_processors import get_children_body
3
 
4
 
5
- def test_get_children_paragraphs():
6
  with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
7
  soup = BeautifulSoup(fo, 'xml')
8
 
9
- children = get_children_body(soup, use_paragraphs=True)
10
 
11
- assert len(children) == 70
12
 
13
 
14
- def test_get_children_sentences():
15
  with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
16
  soup = BeautifulSoup(fo, 'xml')
17
 
18
- children = get_children_body(soup, use_paragraphs=False)
19
 
20
  assert len(children) == 327
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from bs4 import BeautifulSoup
2
+ from document_qa.grobid_processors import get_xml_nodes_body, get_xml_nodes_figures, get_xml_nodes_header
3
 
4
 
5
+ def test_get_xml_nodes_body_paragraphs():
6
  with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
7
  soup = BeautifulSoup(fo, 'xml')
8
 
9
+ nodes = get_xml_nodes_body(soup, use_paragraphs=True)
10
 
11
+ assert len(nodes) == 70
12
 
13
 
14
+ def test_get_xml_nodes_body_sentences():
15
  with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
16
  soup = BeautifulSoup(fo, 'xml')
17
 
18
+ children = get_xml_nodes_body(soup, use_paragraphs=False)
19
 
20
  assert len(children) == 327
21
+
22
+
23
+ def test_get_xml_nodes_figures():
24
+ with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
25
+ soup = BeautifulSoup(fo, 'xml')
26
+
27
+ children = get_xml_nodes_figures(soup)
28
+
29
+ assert len(children) == 13
30
+
31
+
32
+ def test_get_xml_nodes_header_paragraphs():
33
+ with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
34
+ soup = BeautifulSoup(fo, 'xml')
35
+
36
+ children = get_xml_nodes_header(soup)
37
+
38
+ assert len(children) == 8
39
+
40
+ def test_get_xml_nodes_header_sentences():
41
+ with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
42
+ soup = BeautifulSoup(fo, 'xml')
43
+
44
+ children = get_xml_nodes_header(soup, use_paragraphs=False)
45
+
46
+ assert len(children) == 15