include only bibliographic data that potentially have coordinates
Files changed:
- document_qa/document_qa_engine.py +4 -16
- document_qa/grobid_processors.py +60 -31
- tests/test_grobid_processors.py +32 -6
document_qa/document_qa_engine.py

@@ -57,7 +57,7 @@ class DocumentQAEngine:
         grobid_client = GrobidClient(
             grobid_server=self.grobid_url,
             batch_size=1000,
-            coordinates=["p"],
+            coordinates=["p", "title", "persName"],
             sleep_time=5,
             timeout=60,
             check_server=True
@@ -189,7 +189,7 @@ class DocumentQAEngine:
         relevant_documents = multi_query_retriever.get_relevant_documents(query)
         return relevant_documents

-    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, include=[], verbose=False):
+    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
         """
         Extract text from documents using Grobid, if chunk_size is < 0 it keeps each paragraph separately
         """
@@ -233,25 +233,13 @@ class DocumentQAEngine:
         metadatas = [biblio for _ in range(len(texts))]
         ids = [id for id, t in enumerate(texts)]

-        if "biblio" in include:
-            biblio_metadata = copy.copy(biblio)
-            biblio_metadata['type'] = "biblio"
-            biblio_metadata['section'] = "header"
-            for key in ['title', 'authors', 'publication_year']:
-                if key in biblio_metadata:
-                    texts.append("{}: {}".format(key, biblio_metadata[key]))
-                    metadatas.append(biblio_metadata)
-                    ids.append(key)
-
         return texts, metadatas, ids

-    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1, include_biblio=False):
-        include = ["biblio"] if include_biblio else []
+    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1):
         texts, metadata, ids = self.get_text_from_document(
             pdf_path,
             chunk_size=chunk_size,
-            perc_overlap=perc_overlap,
-            include=include)
+            perc_overlap=perc_overlap)
         if doc_id:
             hash = doc_id
         else:
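For context, the `coordinates` argument lists the TEI element types for which GROBID should return PDF bounding boxes (`coords` attributes in the TEI output); adding "title" and "persName" is what lets the new header passages carry coordinates. A minimal sketch of the new configuration, assuming the grobid-client-python package and a hypothetical local server URL:

from grobid_client.grobid_client import GrobidClient

# Hypothetical server URL; point this at your GROBID deployment.
grobid_client = GrobidClient(
    grobid_server="http://localhost:8070",
    batch_size=1000,
    # Request coordinates for paragraphs, the title, and author names,
    # not just paragraphs as before.
    coordinates=["p", "title", "persName"],
    sleep_time=5,
    timeout=60,
    check_server=True
)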
document_qa/grobid_processors.py

@@ -176,32 +176,48 @@ class GrobidProcessor(BaseProcessor):
             pass

         output_data['biblio'] = biblio
-
         passages = []
         output_data['passages'] = passages
-        # if biblio['title'] is not None and len(biblio['title']) > 0:
-        #     passages.append({
-        #         "text": self.post_process(biblio['title']),
-        #         "type": "paragraph",
-        #         "section": "<header>",
-        #         "subSection": "<title>",
-        #         "passage_id": "title0"
-        #     })
-
         passage_type = "paragraph"

-        if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
-            passages.append({
-                "text": self.post_process(doc_biblio.abstract),
-                "type": passage_type,
-                "section": "<header>",
-                "subSection": "<abstract>",
-                "passage_id": "abstract0",
-                "coordinates": ""
-            })
-
         soup = BeautifulSoup(text, 'xml')
-
+        blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)
+
+        passages.append({
+            "text": f"authors: {biblio['authors']}",
+            "type": passage_type,
+            "section": "<header>",
+            "subSection": "<title>",
+            "passage_id": "htitle",
+            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
+                                     blocks_header['authors']])
+        })
+
+        passages.append({
+            "text": self.post_process(" ".join([node.text for node in blocks_header['title']])),
+            "type": passage_type,
+            "section": "<header>",
+            "subSection": "<title>",
+            "passage_id": "htitle",
+            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
+                                     blocks_header['title']])
+        })
+
+        passages.append({
+            "text": self.post_process(
+                ''.join(node.text for node in blocks_header['abstract'] for text in node.find_all(text=True) if
+                        text.parent.name != "ref" or (
+                                text.parent.name == "ref" and text.parent.attrs['type'] != 'bibr'))),
+            "type": passage_type,
+            "section": "<header>",
+            "subSection": "<abstract>",
+            "passage_id": "habstract",
+            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
+                                     blocks_header['abstract']])
+        })
+
+        text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)

         use_paragraphs = True
         if not use_paragraphs:
@@ -236,7 +252,7 @@ class GrobidProcessor(BaseProcessor):
             for paragraph_id, paragraph in enumerate(text_blocks_body)
         ])

-        text_blocks_figures = …
+        text_blocks_figures = get_xml_nodes_figures(soup, verbose=False)

         if not use_paragraphs:
             passages.extend([
@@ -784,23 +800,36 @@ def get_children_list_grobid(soup: object, use_paragraphs: object = True, verbose…
     return children


-def …
-
-
+def get_xml_nodes_header(soup: object, use_paragraphs: bool = True) -> list:
+    sub_tag = "p" if use_paragraphs else "s"
+
+    header_elements = {
+        "authors": [persNameNode for persNameNode in soup.teiHeader.find_all("persName")],
+        "abstract": [p_in_abstract for abstractNodes in soup.teiHeader.find_all("abstract") for p_in_abstract in
+                     abstractNodes.find_all(sub_tag)],
+        "title": [soup.teiHeader.fileDesc.title]
+    }
+
+    return header_elements
+
+
+def get_xml_nodes_body(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
+    nodes = []
+    tag_name = "p" if use_paragraphs else "s"
     for child in soup.TEI.children:
         if child.name == 'text':
-            …
-            …
+            # nodes.extend([subchild.find_all(tag_name) for subchild in child.find_all("body")])
+            nodes.extend(
+                [subsubchild for subchild in child.find_all("body") for subsubchild in subchild.find_all(tag_name)])

     if verbose:
-        print(str(…
+        print(str(nodes))

-    return …
+    return nodes


-def …
+def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
     children = []
-    child_name = "p" if use_paragraphs else "s"
     for child in soup.TEI.children:
         if child.name == 'text':
             children.extend(
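To make the new helpers concrete, here is a small usage sketch. The TEI fragment and its coords values are invented for illustration (real GROBID output is far richer); get_xml_nodes_header returns a dict of node lists keyed by "authors", "abstract", and "title", from which the processor builds header passages and their joined coordinates:

from bs4 import BeautifulSoup
from document_qa.grobid_processors import get_xml_nodes_header

# Invented TEI fragment; "coords" mimics the bounding boxes GROBID attaches
# when coordinate extraction is enabled for an element type.
tei = """
<TEI>
  <teiHeader>
    <fileDesc>
      <titleStmt><title coords="1,60,70,400,20">A sample title</title></titleStmt>
    </fileDesc>
    <profileDesc>
      <abstract><p coords="1,60,120,400,40">A short abstract.</p></abstract>
    </profileDesc>
  </teiHeader>
  <text><body><p coords="2,60,90,400,30">Body paragraph.</p></body></text>
</TEI>
"""

soup = BeautifulSoup(tei, 'xml')
blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)

print(blocks_header['title'][0].text)          # -> A sample title
print(blocks_header['abstract'][0]['coords'])  # -> 1,60,120,400,40

# Roughly how the processor joins per-node boxes into one coordinates string:
print(";".join(node.get('coords', "") for node in blocks_header['abstract']))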
tests/test_grobid_processors.py

@@ -1,20 +1,46 @@
 from bs4 import BeautifulSoup
-from document_qa.grobid_processors import …
+from document_qa.grobid_processors import get_xml_nodes_body, get_xml_nodes_figures, get_xml_nodes_header


-def …
+def test_get_xml_nodes_body_paragraphs():
     with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
         soup = BeautifulSoup(fo, 'xml')

-    …
+    nodes = get_xml_nodes_body(soup, use_paragraphs=True)

-    assert len(…
+    assert len(nodes) == 70


-def …
+def test_get_xml_nodes_body_sentences():
     with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
         soup = BeautifulSoup(fo, 'xml')

-    children = …
+    children = get_xml_nodes_body(soup, use_paragraphs=False)

     assert len(children) == 327
+
+
+def test_get_xml_nodes_figures():
+    with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+
+    children = get_xml_nodes_figures(soup)
+
+    assert len(children) == 13
+
+
+def test_get_xml_nodes_header_paragraphs():
+    with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+
+    children = get_xml_nodes_header(soup)
+
+    assert len(children) == 8
+
+def test_get_xml_nodes_header_sentences():
+    with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+
+    children = get_xml_nodes_header(soup, use_paragraphs=False)
+
+    assert len(children) == 15