include only bibliographic data that potentially have coordinates
Files changed:
- document_qa/document_qa_engine.py +4 -16
- document_qa/grobid_processors.py +60 -31
- tests/test_grobid_processors.py +32 -6
document_qa/document_qa_engine.py

@@ -57,7 +57,7 @@ class DocumentQAEngine:
         grobid_client = GrobidClient(
             grobid_server=self.grobid_url,
             batch_size=1000,
-            coordinates=["p"],
+            coordinates=["p", "title", "persName"],
             sleep_time=5,
             timeout=60,
             check_server=True
@@ -189,7 +189,7 @@ class DocumentQAEngine:
         relevant_documents = multi_query_retriever.get_relevant_documents(query)
         return relevant_documents

-    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, include=[], verbose=False):
+    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
         """
         Extract text from documents using Grobid, if chunk_size is < 0 it keeps each paragraph separately
         """
@@ -233,25 +233,13 @@ class DocumentQAEngine:
         metadatas = [biblio for _ in range(len(texts))]
         ids = [id for id, t in enumerate(texts)]

-        if "biblio" in include:
-            biblio_metadata = copy.copy(biblio)
-            biblio_metadata['type'] = "biblio"
-            biblio_metadata['section'] = "header"
-            for key in ['title', 'authors', 'publication_year']:
-                if key in biblio_metadata:
-                    texts.append("{}: {}".format(key, biblio_metadata[key]))
-                    metadatas.append(biblio_metadata)
-                    ids.append(key)
-
         return texts, metadatas, ids

-    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1, include_biblio=False):
-        include = ["biblio"] if include_biblio else []
+    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1):
         texts, metadata, ids = self.get_text_from_document(
             pdf_path,
             chunk_size=chunk_size,
-            perc_overlap=perc_overlap,
-            include=include)
+            perc_overlap=perc_overlap)
         if doc_id:
             hash = doc_id
         else:
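For context, the `coordinates` argument lists the TEI element types for which GROBID should return PDF bounding boxes (`coords` attributes in the TEI output); adding "title" and "persName" is what lets the new header passages carry coordinates. A minimal sketch of the new configuration, assuming the grobid-client-python package and a hypothetical local server URL:

from grobid_client.grobid_client import GrobidClient

# Hypothetical server URL; point this at your GROBID deployment.
grobid_client = GrobidClient(
    grobid_server="http://localhost:8070",
    batch_size=1000,
    # Request coordinates for paragraphs, the title, and author names,
    # not just paragraphs as before.
    coordinates=["p", "title", "persName"],
    sleep_time=5,
    timeout=60,
    check_server=True
)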
document_qa/grobid_processors.py

@@ -176,32 +176,48 @@ class GrobidProcessor(BaseProcessor):
             pass

         output_data['biblio'] = biblio
-
         passages = []
         output_data['passages'] = passages
-        # if biblio['title'] is not None and len(biblio['title']) > 0:
-        #     passages.append({
-        #         "text": self.post_process(biblio['title']),
-        #         "type": "paragraph",
-        #         "section": "<header>",
-        #         "subSection": "<title>",
-        #         "passage_id": "title0"
-        #     })
-
         passage_type = "paragraph"

-        if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
-            passages.append({
-                "text": self.post_process(doc_biblio.abstract),
-                "type": passage_type,
-                "section": "<header>",
-                "subSection": "<abstract>",
-                "passage_id": "abstract0",
-                "coordinates": ""
-            })
-
         soup = BeautifulSoup(text, 'xml')
-
+        blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)
+
+        passages.append({
+            "text": f"authors: {biblio['authors']}",
+            "type": passage_type,
+            "section": "<header>",
+            "subSection": "<title>",
+            "passage_id": "htitle",
+            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
+                                     blocks_header['authors']])
+        })
+
+        passages.append({
+            "text": self.post_process(" ".join([node.text for node in blocks_header['title']])),
+            "type": passage_type,
+            "section": "<header>",
+            "subSection": "<title>",
+            "passage_id": "htitle",
+            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
+                                     blocks_header['title']])
+        })
+
+        passages.append({
+            "text": self.post_process(
+                ''.join(node.text for node in blocks_header['abstract'] for text in node.find_all(text=True) if
+                        text.parent.name != "ref" or (
+                                text.parent.name == "ref" and text.parent.attrs['type'] != 'bibr'))),
+            "type": passage_type,
+            "section": "<header>",
+            "subSection": "<abstract>",
+            "passage_id": "habstract",
+            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
+                                     blocks_header['abstract']])
+        })
+
+        text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)

         use_paragraphs = True
         if not use_paragraphs:
@@ -236,7 +252,7 @@ class GrobidProcessor(BaseProcessor):
             for paragraph_id, paragraph in enumerate(text_blocks_body)
         ])

-        text_blocks_figures = …
+        text_blocks_figures = get_xml_nodes_figures(soup, verbose=False)

         if not use_paragraphs:
             passages.extend([
@@ -784,23 +800,36 @@ def get_children_list_grobid(soup: object, use_paragraphs: object = True, verbose…
     return children


-def …
-
-
+def get_xml_nodes_header(soup: object, use_paragraphs: bool = True) -> list:
+    sub_tag = "p" if use_paragraphs else "s"
+
+    header_elements = {
+        "authors": [persNameNode for persNameNode in soup.teiHeader.find_all("persName")],
+        "abstract": [p_in_abstract for abstractNodes in soup.teiHeader.find_all("abstract") for p_in_abstract in
+                     abstractNodes.find_all(sub_tag)],
+        "title": [soup.teiHeader.fileDesc.title]
+    }
+
+    return header_elements
+
+
+def get_xml_nodes_body(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
+    nodes = []
+    tag_name = "p" if use_paragraphs else "s"
     for child in soup.TEI.children:
         if child.name == 'text':
-            …
-            …
+            # nodes.extend([subchild.find_all(tag_name) for subchild in child.find_all("body")])
+            nodes.extend(
+                [subsubchild for subchild in child.find_all("body") for subsubchild in subchild.find_all(tag_name)])

     if verbose:
-        print(str(…
+        print(str(nodes))

-    return …
+    return nodes


-def …
+def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
     children = []
-    child_name = "p" if use_paragraphs else "s"
     for child in soup.TEI.children:
         if child.name == 'text':
             children.extend(
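To make the new helpers concrete, here is a small usage sketch. The TEI fragment and its coords values are invented for illustration (real GROBID output is far richer); get_xml_nodes_header returns a dict of node lists keyed by "authors", "abstract", and "title", from which the processor builds header passages and their joined coordinates:

from bs4 import BeautifulSoup
from document_qa.grobid_processors import get_xml_nodes_header

# Invented TEI fragment; "coords" mimics the bounding boxes GROBID attaches
# when coordinate extraction is enabled for an element type.
tei = """
<TEI>
  <teiHeader>
    <fileDesc>
      <titleStmt><title coords="1,60,70,400,20">A sample title</title></titleStmt>
    </fileDesc>
    <profileDesc>
      <abstract><p coords="1,60,120,400,40">A short abstract.</p></abstract>
    </profileDesc>
  </teiHeader>
  <text><body><p coords="2,60,90,400,30">Body paragraph.</p></body></text>
</TEI>
"""

soup = BeautifulSoup(tei, 'xml')
blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)

print(blocks_header['title'][0].text)          # -> A sample title
print(blocks_header['abstract'][0]['coords'])  # -> 1,60,120,400,40

# Roughly how the processor joins per-node boxes into one coordinates string:
print(";".join(node.get('coords', "") for node in blocks_header['abstract']))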
tests/test_grobid_processors.py

@@ -1,20 +1,46 @@
 from bs4 import BeautifulSoup
-from document_qa.grobid_processors import …
+from document_qa.grobid_processors import get_xml_nodes_body, get_xml_nodes_figures, get_xml_nodes_header


-def …
+def test_get_xml_nodes_body_paragraphs():
     with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
         soup = BeautifulSoup(fo, 'xml')

-    …
+    nodes = get_xml_nodes_body(soup, use_paragraphs=True)

-    assert len(…
+    assert len(nodes) == 70


-def …
+def test_get_xml_nodes_body_sentences():
     with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
         soup = BeautifulSoup(fo, 'xml')

-    children = …
+    children = get_xml_nodes_body(soup, use_paragraphs=False)

     assert len(children) == 327
+
+
+def test_get_xml_nodes_figures():
+    with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+
+    children = get_xml_nodes_figures(soup)
+
+    assert len(children) == 13
+
+
+def test_get_xml_nodes_header_paragraphs():
+    with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+
+    children = get_xml_nodes_header(soup)
+
+    assert len(children) == 8
+
+def test_get_xml_nodes_header_sentences():
+    with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+
+    children = get_xml_nodes_header(soup, use_paragraphs=False)
+
+    assert len(children) == 15