lfoppiano committed
Commit
5fd26bb
1 Parent(s): 9d4be7c

use paragraphs instead of sentences

document_qa/document_qa_engine.py CHANGED
@@ -56,7 +56,7 @@ class DocumentQAEngine:
         grobid_client = GrobidClient(
             grobid_server=self.grobid_url,
             batch_size=1000,
-            coordinates=["s"],
+            coordinates=["p"],
             sleep_time=5,
             timeout=60,
             check_server=True
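
For context, the coordinates list names the TEI elements for which GROBID should return PDF bounding boxes, so this change moves highlight granularity from sentences ("s") to paragraphs ("p"). Below is a small sketch, not part of the commit, of how such a coords value can be unpacked; the sample string is invented, but the "page,x,y,width,height" layout matches how streamlit_app.py splits these strings further down.

```python
# Sketch (not part of the commit): unpacking a TEI @coords value as
# produced by GROBID. Each ';'-separated box reads "page,x,y,width,height",
# matching how streamlit_app.py splits these strings below.
def parse_coords(coords: str) -> list:
    boxes = []
    for box in coords.split(";"):
        page, x, y, width, height = box.split(",")
        boxes.append({"page": int(page), "x": float(x), "y": float(y),
                      "width": float(width), "height": float(height)})
    return boxes


print(parse_coords("1,53.50,116.59,187.47,10.55"))  # invented sample value
```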
document_qa/grobid_processors.py CHANGED
@@ -136,7 +136,7 @@ class GrobidProcessor(BaseProcessor):
             input_path,
             consolidate_header=True,
             consolidate_citations=False,
-            segment_sentences=True,
+            segment_sentences=False,
             tei_coordinates=coordinates,
             include_raw_citations=False,
             include_raw_affiliations=False,
@@ -188,7 +188,7 @@ class GrobidProcessor(BaseProcessor):
         # "passage_id": "title0"
         # })

-        passage_type = "sentence" if coordinates else "paragraph"
+        passage_type = "paragraph"

         if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
             passages.append({
@@ -201,42 +201,74 @@ class GrobidProcessor(BaseProcessor):
         })

         soup = BeautifulSoup(text, 'xml')
-        text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=False)
-
-        passages.extend([
-            {
-                "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
-                                                  text.parent.name != "ref" or (
-                                                          text.parent.name == "ref" and text.parent.attrs[
-                                                      'type'] != 'bibr'))),
-                "type": passage_type,
-                "section": "<body>",
-                "subSection": "<sentence>",
-                "passage_id": str(paragraph_id) + str(sentence_id),
-                # "coordinates": sentence['coords'].split(";") if coordinates else []
-                "coordinates": sentence['coords'] if coordinates else ""
-            }
-            for paragraph_id, paragraph in enumerate(text_blocks_body) for
-            sentence_id, sentence in enumerate(paragraph)
-        ])
+        text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=True)
+
+        use_paragraphs = True
+        if not use_paragraphs:
+            passages.extend([
+                {
+                    "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
+                                                      text.parent.name != "ref" or (
+                                                              text.parent.name == "ref" and text.parent.attrs[
+                                                          'type'] != 'bibr'))),
+                    "type": passage_type,
+                    "section": "<body>",
+                    "subSection": "<paragraph>",
+                    "passage_id": str(paragraph_id),
+                    "coordinates": paragraph['coords'] if coordinates and sentence.has_attr('coords') else ""
+                }
+                for paragraph_id, paragraph in enumerate(text_blocks_body) for
+                sentence_id, sentence in enumerate(paragraph)
+            ])
+        else:
+            passages.extend([
+                {
+                    "text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
+                                                      text.parent.name != "ref" or (
+                                                              text.parent.name == "ref" and text.parent.attrs[
+                                                          'type'] != 'bibr'))),
+                    "type": passage_type,
+                    "section": "<body>",
+                    "subSection": "<paragraph>",
+                    "passage_id": str(paragraph_id),
+                    "coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
+                }
+                for paragraph_id, paragraph in enumerate(text_blocks_body)
+            ])

         text_blocks_figures = get_children_figures(soup, verbose=False)

-        passages.extend([
-            {
-                "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
-                                                  text.parent.name != "ref" or (
-                                                          text.parent.name == "ref" and text.parent.attrs[
-                                                      'type'] != 'bibr'))),
-                "type": passage_type,
-                "section": "<body>",
-                "subSection": "<figure>",
-                "passage_id": str(paragraph_id) + str(sentence_id),
-                "coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
-            }
-            for paragraph_id, paragraph in enumerate(text_blocks_figures) for
-            sentence_id, sentence in enumerate(paragraph)
-        ])
+        if not use_paragraphs:
+            passages.extend([
+                {
+                    "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
+                                                      text.parent.name != "ref" or (
+                                                              text.parent.name == "ref" and text.parent.attrs[
+                                                          'type'] != 'bibr'))),
+                    "type": passage_type,
+                    "section": "<body>",
+                    "subSection": "<figure>",
+                    "passage_id": str(paragraph_id) + str(sentence_id),
+                    "coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
+                }
+                for paragraph_id, paragraph in enumerate(text_blocks_figures) for
+                sentence_id, sentence in enumerate(paragraph)
+            ])
+        else:
+            passages.extend([
+                {
+                    "text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
+                                                      text.parent.name != "ref" or (
+                                                              text.parent.name == "ref" and text.parent.attrs[
+                                                          'type'] != 'bibr'))),
+                    "type": passage_type,
+                    "section": "<body>",
+                    "subSection": "<figure>",
+                    "passage_id": str(paragraph_id),
+                    "coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
+                }
+                for paragraph_id, paragraph in enumerate(text_blocks_figures)
+            ])

         return output_data

@@ -532,6 +564,21 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
     def extract_materials(self, text):
         return self.gmp.extract_materials(text)

+    @staticmethod
+    def box_to_dict(box, color=None, type=None):
+
+        if box is None or box == "" or len(box) < 5:
+            return {}
+
+        item = {"page": box[0], "x": box[1], "y": box[2], "width": box[3], "height": box[4]}
+        if color is not None:
+            item['color'] = color
+
+        if type:
+            item['type'] = type
+
+        return item
+
     @staticmethod
     def prune_overlapping_annotations(entities: list) -> list:
         # Sorting by offsets
@@ -742,7 +789,8 @@ def get_children_body(soup: object, use_paragraphs: object = True, verbose: obje
     child_name = "p" if use_paragraphs else "s"
     for child in soup.TEI.children:
         if child.name == 'text':
-            children.extend([subchild.find_all(child_name) for subchild in child.find_all("body")])
+            children.extend(
+                [subchild for subchild in child.find_all("body") for subchild in subchild.find_all(child_name)])

     if verbose:
         print(str(children))
@@ -755,7 +803,8 @@ def get_children_figures(soup: object, use_paragraphs: object = True, verbose: o
     child_name = "p" if use_paragraphs else "s"
     for child in soup.TEI.children:
         if child.name == 'text':
-            children.extend([subchild.find_all("figDesc") for subchild in child.find_all("body")])
+            children.extend(
+                [subchild for subchilds in child.find_all("body") for subchild in subchilds.find_all("figDesc")])

     if verbose:
         print(str(children))
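
Worth noting: get_children_body previously returned one ResultSet per <body> element, i.e. a nested list, while the rewritten comprehension yields a flat list of matching elements, which is what the paragraph branch above iterates over and what the new tests count. A minimal sketch of the two shapes, using a toy TEI string rather than real GROBID output:

```python
# Toy illustration (not from the commit) of the old vs. new return
# shape of get_children_body.
from bs4 import BeautifulSoup

tei = "<TEI><text><body><p>First paragraph.</p><p>Second paragraph.</p></body></text></TEI>"
soup = BeautifulSoup(tei, "xml")

# Old shape: one list per <body>, so callers saw a nested structure.
nested = [body.find_all("p") for body in soup.TEI.find_all("body")]
assert len(nested) == 1 and len(nested[0]) == 2

# New shape: a flat list with one entry per <p>.
flat = [p for body in soup.TEI.find_all("body") for p in body.find_all("p")]
assert len(flat) == 2
```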
requirements.txt CHANGED
@@ -7,7 +7,7 @@ grobid_tei_xml==0.1.3
 tqdm
 pyyaml==6.0
 pytest
-streamlit==1.27.2
+streamlit==1.29.0
 lxml
 Beautifulsoup4
 python-dotenv
streamlit_app.py CHANGED
@@ -296,7 +296,7 @@ with st.sidebar:
     mode = st.radio("Query mode", ("LLM", "Embeddings"), disabled=not uploaded_file, index=0, horizontal=True,
                     help="LLM will respond the question, Embedding will show the "
                          "paragraphs relevant to the question in the paper.")
-    chunk_size = st.slider("Chunks size", -1, 2000, value=250,
+    chunk_size = st.slider("Chunks size", -1, 2000, value=-1,
                            help="Size of chunks in which the document is partitioned",
                            disabled=uploaded_file is not None)
     context_size = st.slider("Context size", 3, 10, value=4,
@@ -410,8 +410,9 @@ with right_column:
                 st.session_state.doc_id,
                 context_size=context_size)
             annotations = [
-                {"page": coo[0], "x": coo[1], "y": coo[2], "width": coo[3], "height": coo[4], "color": "grey"} for coo in [c.split(",") for coord in
-                                                                                                                           coordinates for c in coord]]
+                GrobidAggregationProcessor.box_to_dict(coo) for coo in [c.split(",") for coord in
+                                                                        coordinates for c in coord]
+            ]
             gradients = generate_color_gradient(len(annotations))
             for i, color in enumerate(gradients):
                 annotations[i]['color'] = color
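
The annotation dictionaries are now built by the new GrobidAggregationProcessor.box_to_dict helper, which centralizes the page/x/y/width/height mapping and silently drops malformed boxes (fewer than five fields) instead of raising. A rough usage sketch, with invented coordinate strings and an import path assumed from the repository layout above:

```python
# Rough sketch of the refactored annotation building. The coordinate
# strings are invented; the import path is assumed from the repo layout.
from document_qa.grobid_processors import GrobidAggregationProcessor

coordinates = [["1,53.50,116.59,187.47,10.55", "2,60.00,80.00,150.00,9.80"]]

annotations = [
    GrobidAggregationProcessor.box_to_dict(coo)
    for coo in [c.split(",") for coord in coordinates for c in coord]
]
# Each entry looks like {"page": "1", "x": "53.50", ...}; values stay
# strings, and boxes with fewer than five fields collapse to {}.
print(annotations)
```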
tests/__init__.py ADDED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,37 @@
+import logging
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+from _pytest._py.path import LocalPath
+
+# derived from https://github.com/elifesciences/sciencebeam-trainer-delft/tree/develop/tests
+
+LOGGER = logging.getLogger(__name__)
+
+
+@pytest.fixture(scope='session', autouse=True)
+def setup_logging():
+    logging.root.handlers = []
+    logging.basicConfig(level='INFO')
+    logging.getLogger('tests').setLevel('DEBUG')
+    # logging.getLogger('sciencebeam_trainer_delft').setLevel('DEBUG')
+
+
+def _backport_assert_called(mock: MagicMock):
+    assert mock.called
+
+
+@pytest.fixture(scope='session', autouse=True)
+def patch_magicmock():
+    try:
+        MagicMock.assert_called
+    except AttributeError:
+        MagicMock.assert_called = _backport_assert_called
+
+
+@pytest.fixture
+def temp_dir(tmpdir: LocalPath):
+    # convert to standard Path
+    return Path(str(tmpdir))
+
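
For illustration, a hypothetical test (not part of the commit) showing how the temp_dir fixture above would be consumed:

```python
# Hypothetical consumer of the temp_dir fixture (not in the commit).
from pathlib import Path


def test_writes_under_temp_dir(temp_dir: Path):
    out = temp_dir / "output.txt"
    out.write_text("hello")
    assert out.read_text() == "hello"
```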
tests/resources/2312.07559.paragraphs.tei.xml ADDED
The diff for this file is too large to render.

tests/resources/2312.07559.sentences.tei.xml ADDED
The diff for this file is too large to render.

tests/test_grobid_processors.py ADDED
@@ -0,0 +1,20 @@
+from bs4 import BeautifulSoup
+from document_qa.grobid_processors import get_children_body
+
+
+def test_get_children_paragraphs():
+    with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+
+    children = get_children_body(soup, use_paragraphs=True)
+
+    assert len(children) == 70
+
+
+def test_get_children_sentences():
+    with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
+        soup = BeautifulSoup(fo, 'xml')
+
+    children = get_children_body(soup, use_paragraphs=False)
+
+    assert len(children) == 327
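
Note that both tests open their fixture files with working-directory-relative paths ("resources/..."), so they appear to assume pytest is launched from inside the tests/ directory.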