Spaces:
Running
Running
fix dependencies
Browse files- grobid_processors.py +19 -4
grobid_processors.py
CHANGED
@@ -8,8 +8,6 @@ import grobid_tei_xml
|
|
8 |
from bs4 import BeautifulSoup
|
9 |
from tqdm import tqdm
|
10 |
|
11 |
-
from commons import supermat_tei_parser
|
12 |
-
|
13 |
|
14 |
def get_span_start(type, title=None):
|
15 |
title_ = ' title="' + title + '"' if title is not None else ""
|
@@ -659,7 +657,7 @@ class XmlProcessor(BaseProcessor):
|
|
659 |
def parse_xml(self, text):
|
660 |
output_data = OrderedDict()
|
661 |
soup = BeautifulSoup(text, 'xml')
|
662 |
-
text_blocks_children =
|
663 |
|
664 |
passages = []
|
665 |
output_data['passages'] = passages
|
@@ -680,8 +678,25 @@ class XmlProcessor(BaseProcessor):
|
|
680 |
|
681 |
return output_data
|
682 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
683 |
|
684 |
-
def
|
685 |
children = []
|
686 |
|
687 |
child_name = "p" if use_paragraphs else "s"
|
|
|
8 |
from bs4 import BeautifulSoup
|
9 |
from tqdm import tqdm
|
10 |
|
|
|
|
|
11 |
|
12 |
def get_span_start(type, title=None):
|
13 |
title_ = ' title="' + title + '"' if title is not None else ""
|
|
|
657 |
def parse_xml(self, text):
|
658 |
output_data = OrderedDict()
|
659 |
soup = BeautifulSoup(text, 'xml')
|
660 |
+
text_blocks_children = get_children_list_supermat(soup, verbose=False)
|
661 |
|
662 |
passages = []
|
663 |
output_data['passages'] = passages
|
|
|
678 |
|
679 |
return output_data
|
680 |
|
681 |
+
def get_children_list_supermat(soup, use_paragraphs=False, verbose=False):
    """Collect the text-bearing elements of a parsed TEI document.

    From the ``teiHeader`` child of the TEI root this gathers the ``title``
    elements, plus the paragraph/sentence elements found under every
    ``abstract`` and every ``ab`` element whose ``type`` is ``keywords``.
    From the ``text`` child it gathers the paragraph/sentence elements found
    under every ``body``.

    Args:
        soup: Parsed document exposing the TEI root as ``soup.tei``
            (``parse_xml`` passes a ``BeautifulSoup(text, 'xml')`` object).
        use_paragraphs: When true, collect ``p`` elements; otherwise collect
            ``s`` elements.
        verbose: When true, print the collected groups before returning.

    Returns:
        A list of groups, each group being the result of one ``find_all``
        call. The list is empty when the document has no TEI root.
    """
    groups = []

    root = soup.tei
    if root is None:
        # No <TEI> root element: nothing to collect. Without this guard the
        # loop below would fail with AttributeError on ``None.children``.
        return groups

    child_name = "p" if use_paragraphs else "s"
    for child in root.children:
        if child.name == 'teiHeader':
            groups.append(child.find_all("title"))
            groups.extend([subchild.find_all(child_name) for subchild in child.find_all("abstract")])
            groups.extend([subchild.find_all(child_name) for subchild in child.find_all("ab", {"type": "keywords"})])
        elif child.name == 'text':
            groups.extend([subchild.find_all(child_name) for subchild in child.find_all("body")])

    if verbose:
        print(str(groups))

    return groups
|
698 |
|
699 |
+
def get_children_list_grobid(soup: object, use_paragraphs: object = True, verbose: object = False) -> object:
|
700 |
children = []
|
701 |
|
702 |
child_name = "p" if use_paragraphs else "s"
|