import re
from collections import OrderedDict
from html import escape
from pathlib import Path
import dateparser
import grobid_tei_xml
from bs4 import BeautifulSoup
from tqdm import tqdm
def get_span_start(type, title=None):
    # HTML-style opening tag; the optional title (the quantified substance)
    # becomes a tooltip on the annotated span.
    title_ = ' title="' + title + '"' if title is not None else ""
    return '<span class="' + type + '"' + title_ + '>'


def get_span_end():
    return '</span>'


def get_rs_start(type):
    # XML-style opening tag (SuperMat format).
    return '<rs type="' + type + '">'


def get_rs_end():
    return '</rs>'

def has_space_between_value_and_unit(quantity):
    # True when at least one character separates the raw value from its raw
    # unit (e.g. "5 K" rather than "5K").
    return quantity['offsetEnd'] < quantity['rawUnit']['offsetStart']
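
# Illustrative input shape (an assumption based on the grobid-quantities JSON
# schema): for the measurement "5 K" the parser returns roughly
#   {"rawValue": "5", "offsetStart": 0, "offsetEnd": 1,
#    "rawUnit": {"name": "K", "offsetStart": 2, "offsetEnd": 3}}
# in which case the value ends before the unit starts and the check is True;
# for "5K" the offsets touch and it is False.
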
def decorate_text_with_annotations(text, spans, tag="span"):
    """
    Decorate a text using spans, with one of two styles selected by the tag:
    - "span" generates HTML-like annotated text
    - "rs" generates XML-like annotated text (SuperMat format)
    """
    sorted_spans = sorted(spans, key=lambda item: item['offset_start'])
    annotated_text = ""
    start = 0
    for span in sorted_spans:
        type = span['type'].replace("<", "").replace(">", "")
        if 'unit_type' in span and span['unit_type'] is not None:
            type = span['unit_type'].replace(" ", "_")
        annotated_text += escape(text[start: span['offset_start']])
        title = span['quantified'] if 'quantified' in span else None
        annotated_text += get_span_start(type, title) if tag == "span" else get_rs_start(type)
        annotated_text += escape(text[span['offset_start']: span['offset_end']])
        annotated_text += get_span_end() if tag == "span" else get_rs_end()
        start = span['offset_end']
    annotated_text += escape(text[start:])
    return annotated_text
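
# Minimal usage sketch (offsets are character positions in `text`; the span
# type "<tcValue>" is only an assumed example label):
#   spans = [{"type": "<tcValue>", "offset_start": 22, "offset_end": 26}]
#   decorate_text_with_annotations("The critical point is 10 K", spans)
#   -> 'The critical point is <span class="tcValue">10 K</span>'
#   decorate_text_with_annotations("The critical point is 10 K", spans, tag="rs")
#   -> 'The critical point is <rs type="tcValue">10 K</rs>'
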
def extract_quantities(client, x_all, column_text_index):
    # relevant_items = ['magnetic field strength', 'magnetic induction', 'maximum energy product',
    #                   "magnetic flux density", "magnetic flux"]
    # property_keywords = ['coercivity', 'remanence']

    processor = GrobidQuantitiesProcessor(client)
    output_data = []

    for idx, example in tqdm(enumerate(x_all), desc="extract quantities"):
        text = example[column_text_index]
        spans = processor.extract_quantities(text)
        data_record = {
            "id": example[0],
            "filename": example[1],
            "passage_id": example[2],
            "text": text,
            "spans": spans
        }
        output_data.append(data_record)

    return output_data

def extract_materials(client, x_all, column_text_index):
    processor = GrobidMaterialsProcessor(client)
    output_data = []

    for idx, example in tqdm(enumerate(x_all), desc="extract materials"):
        text = example[column_text_index]
        spans = processor.extract_materials(text)
        data_record = {
            "id": example[0],
            "filename": example[1],
            "passage_id": example[2],
            "text": text,
            "spans": spans
        }
        output_data.append(data_record)

    return output_data
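
# Illustrative input (a sketch, assuming rows of `x_all` are sequences whose
# first three columns are id, filename and passage id, with the text at
# `column_text_index`, and that `client` is the matching grobid service client):
#   x_all = [("d1-p0", "paper1.pdf", 0, "The sample melts at 1200 K.")]
#   quantity_records = extract_quantities(quantities_client, x_all, 3)
#   material_records = extract_materials(materials_client, x_all, 3)
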
def get_parsed_value_type(quantity):
    # Returns None implicitly when the quantity carries no parsed structure.
    if 'parsedValue' in quantity and 'structure' in quantity['parsedValue']:
        return quantity['parsedValue']['structure']['type']
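
# Example (an assumption about the grobid-quantities payload): a quantity such
# as {"parsedValue": {"structure": {"type": "NUMBER", "formatted": "8"}}}
# yields "NUMBER"; anything without that nesting falls through to None.
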
class BaseProcessor(object):
    # def __init__(self, grobid_superconductors_client=None, grobid_quantities_client=None):
    #     self.grobid_superconductors_client = grobid_superconductors_client
    #     self.grobid_quantities_client = grobid_quantities_client

    patterns = [
        r'\d+e\d+'
    ]

    def post_process(self, text):
        # Normalise characters that PDF extraction maps to the wrong glyphs.
        output = text.replace('À', '-')
        output = output.replace('¼', '=')
        output = output.replace('þ', '+')
        output = output.replace('Â', 'x')
        output = output.replace('$', '~')
        output = output.replace('−', '-')
        output = output.replace('–', '-')
        # Digit-'e'-digit sequences are treated as ranges whose dash was
        # mis-extracted as 'e' (e.g. "1e5" -> "1-5").
        for pattern in self.patterns:
            output = re.sub(pattern, lambda match: match.group().replace('e', '-'), output)
        return output
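
# Example of the normalisation, assuming typical PDF-extraction mojibake:
#   BaseProcessor().post_process("T c ¼ 92 K À 95 K")  ->  "T c = 92 K - 95 K"
#   BaseProcessor().post_process("between 1e5 T")      ->  "between 1-5 T"
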
class GrobidProcessor(BaseProcessor):
    def __init__(self, grobid_client):
        # super().__init__()
        self.grobid_client = grobid_client

    def process_structure(self, input_path, coordinates=False):
        pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
                                                                input_path,
                                                                consolidate_header=True,
                                                                consolidate_citations=False,
                                                                segment_sentences=False,
                                                                tei_coordinates=coordinates,
                                                                include_raw_citations=False,
                                                                include_raw_affiliations=False,
                                                                generateIDs=True)
        if status != 200:
            return

        output_data = self.parse_grobid_xml(text, coordinates=coordinates)
        output_data['filename'] = Path(pdf_file).stem.replace(".tei", "")

        return output_data

    def process_single(self, input_file):
        doc = self.process_structure(input_file)
        if doc is None:
            # GROBID returned a non-200 status; nothing to annotate.
            return None

        for paragraph in doc['passages']:
            entities = self.process_single_text(paragraph['text'])
            paragraph['spans'] = entities

        return doc
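
    # Illustrative pipeline (a sketch, assuming `grobid_client` exposes a
    # process_pdf(...) call returning (pdf_file, status, tei_xml), as used in
    # process_structure above):
    #   processor = GrobidProcessor(grobid_client)
    #   doc = processor.process_structure("paper.pdf", coordinates=True)
    #   doc['biblio']    # header metadata: doi, authors, title, hash
    #   doc['passages']  # flattened text blocks with ids and coordinates
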
    def parse_grobid_xml(self, text, coordinates=False):
        output_data = OrderedDict()
        doc_biblio = grobid_tei_xml.parse_document_xml(text)
        biblio = {
            "doi": doc_biblio.header.doi if doc_biblio.header.doi is not None else "",
            "authors": ", ".join([author.full_name for author in doc_biblio.header.authors]),
            "title": doc_biblio.header.title,
            "hash": doc_biblio.pdf_md5
        }
        try:
            year = dateparser.parse(doc_biblio.header.date).year
            biblio["publication_year"] = year
        except Exception:
            # The header date may be missing or unparsable; the year is optional.
            pass

        output_data['biblio'] = biblio

        passages = []
        output_data['passages'] = passages
        passage_type = "paragraph"

        soup = BeautifulSoup(text, 'xml')
        blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)

        passages.append({
            "text": f"authors: {biblio['authors']}",
            "type": passage_type,
            "section": "<header>",
            "subSection": "<authors>",
            "passage_id": "hauthors",
            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
                                     blocks_header['authors']])
        })

        passages.append({
            "text": self.post_process(" ".join([node.text for node in blocks_header['title']])),
            "type": passage_type,
            "section": "<header>",
            "subSection": "<title>",
            "passage_id": "htitle",
            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
                                     blocks_header['title']])
        })

        passages.append({
            "text": self.post_process(
                ''.join(text for node in blocks_header['abstract'] for text in node.find_all(text=True) if
                        text.parent.name != "ref" or (
                            text.parent.name == "ref" and text.parent.attrs['type'] != 'bibr'))),
            "type": passage_type,
            "section": "<header>",
            "subSection": "<abstract>",
            "passage_id": "habstract",
            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
                                     blocks_header['abstract']])
        })
        use_paragraphs = True
        text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=use_paragraphs)

        if not use_paragraphs:
            passages.extend([
                {
                    "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
                                                      text.parent.name != "ref" or (
                                                          text.parent.name == "ref" and text.parent.attrs[
                                                              'type'] != 'bibr'))),
                    "type": passage_type,
                    "section": "<body>",
                    "subSection": "<paragraph>",
                    "passage_id": str(paragraph_id) + str(sentence_id),
                    "coordinates": sentence['coords'] if coordinates and sentence.has_attr('coords') else ""
                }
                for paragraph_id, paragraph in enumerate(text_blocks_body) for
                sentence_id, sentence in enumerate(paragraph)
            ])
        else:
            passages.extend([
                {
                    "text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
                                                      text.parent.name != "ref" or (
                                                          text.parent.name == "ref" and text.parent.attrs[
                                                              'type'] != 'bibr'))),
                    "type": passage_type,
                    "section": "<body>",
                    "subSection": "<paragraph>",
                    "passage_id": str(paragraph_id),
                    "coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
                }
                for paragraph_id, paragraph in enumerate(text_blocks_body)
            ])
        text_blocks_figures = get_xml_nodes_figures(soup, verbose=False)

        if not use_paragraphs:
            passages.extend([
                {
                    "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
                                                      text.parent.name != "ref" or (
                                                          text.parent.name == "ref" and text.parent.attrs[
                                                              'type'] != 'bibr'))),
                    "type": passage_type,
                    "section": "<body>",
                    "subSection": "