|
import json |
|
import os |
|
import time |
|
|
|
import requests |
|
import yaml |
|
|
|
from client import ApiClient |
|
|
|
''' |
|
This client is a generic client for any Grobid application and sub-modules. |
|
At the moment, it supports only single document processing. |
|
|
|
Source: https://github.com/kermitt2/grobid-client-python |
|
''' |
|
|
|
|
|
class GrobidClientGeneric(ApiClient):
    """Generic HTTP client for a Grobid service and its sub-modules.

    The client is driven by a configuration mapping (usually loaded from a
    YAML file) that is read with the following keys:

    * ``config['grobid']['server']``: base URL of the service;
    * ``config['grobid']['url_mapping'][<action>]``: path (optionally with a
      ``?key=value&...`` query string) appended to the base URL;
    * ``config['sleep_time']``: seconds to wait before retrying a request
      the server refused because it is saturated;
    * ``config['max_retry']``: maximum number of attempts for ``process_pdf``.

    HTTP POSTs go through ``self.post`` inherited from ``ApiClient``; its
    result is unpacked as ``(response, status_code)``.
    """

    # Template for the default request headers. It is copied on every call so
    # that no shared mutable object is ever handed to the transport layer.
    _DEFAULT_HEADERS = {"Accept": "application/json"}

    def __init__(self, config_path=None, ping=False):
        """Create the client, optionally loading a YAML configuration.

        :param config_path: path to a YAML configuration file. When ``None``
            the client is left unconfigured (``self.config`` is ``None``) and
            ``set_config`` must be called before any request is made.
        :param ping: when true (and a configuration was loaded), check that
            the server answers and raise ``Exception`` if it does not.
        :raises Exception: if ``ping`` is requested and the server is down.
        """
        self.config = None
        if config_path is not None:
            self.config = self.load_yaml_config_from_file(path=config_path)
            super().__init__(self.config['grobid']['server'])

            if ping and not self.ping_grobid():
                raise Exception("Grobid is down.")

        # NOTE(review): this overwrites any existing NO_PROXY value for the
        # whole process with a site-specific domain. Kept as-is for backward
        # compatibility; consider making it configurable.
        os.environ['NO_PROXY'] = "nims.go.jp"

    def load_json_config_from_file(self, path='./config.json', ping=False):
        """Load and return a JSON configuration.

        The loaded configuration is only returned; it is NOT stored on the
        instance. The optional ping therefore uses the configuration already
        set on ``self``.

        This used to be declared ``@staticmethod`` while still taking and
        using ``self``, which made ``client.load_json_config_from_file(path)``
        bind the path to ``self``. It is now a regular instance method.

        :param path: path of the JSON file.
        :param ping: when true, check that the server answers.
        :returns: the parsed JSON content.
        :raises Exception: if ``ping`` is requested and the server is down.
        """
        with open(path, 'r') as fp:
            config = json.load(fp)

        if ping and not self.ping_grobid():
            raise Exception("Grobid is down.")

        return config

    def load_yaml_config_from_file(self, path='./config.yaml'):
        """Load and return a YAML configuration.

        Any failure while reading or parsing the file is reported on stdout
        and terminates the process with exit status 1.

        :param path: path of the YAML file.
        :returns: the parsed YAML content.
        :raises SystemExit: if the file cannot be read or parsed.
        """
        try:
            with open(path, 'r') as the_file:
                config = yaml.safe_load(the_file.read())
        except Exception as e:
            print("Configuration could not be loaded: ", str(e))
            # ``exit`` is injected by the ``site`` module and may be missing
            # (e.g. ``python -S`` or frozen apps); raise SystemExit directly.
            raise SystemExit(1)

        return config

    def set_config(self, config, ping=False):
        """Replace the current configuration.

        :param config: configuration mapping (see the class docstring).
        :param ping: when true, check that the server answers.
        :raises Exception: if ``ping`` is requested and the server is down or
            the check itself fails; the original error is chained as cause.
        """
        self.config = config
        if ping:
            try:
                if not self.ping_grobid():
                    raise Exception("Grobid is down.")
            except Exception as e:
                raise Exception("Grobid is down or other problems were encountered. ", e) from e

    def ping_grobid(self):
        """Return ``True`` if the configured ``ping`` URL answers with HTTP 200.

        Errors raised by ``requests.get`` (e.g. connection refused) are not
        caught here and propagate to the caller.
        """
        response = requests.get(self.get_grobid_url("ping"))
        status = response.status_code

        if status != 200:
            print('GROBID server does not appear up and running ' + str(status))
            return False

        print("GROBID server is up and running")
        return True

    def get_grobid_url(self, action):
        """Return the full URL for ``action``: base server URL + mapped path.

        :raises KeyError: if ``action`` has no entry in the URL mapping.
        """
        grobid_config = self.config['grobid']
        return grobid_config['server'] + grobid_config['url_mapping'][action]

    def _prepare_request(self, method_name, params, headers):
        """Resolve URL, form parameters and headers for a request.

        Parameters embedded in the configured URL are used as the base and
        the caller-supplied ``params`` are layered on top, so an explicit
        argument wins over the configuration.

        :returns: ``(url_without_query, merged_params, headers)``.
        """
        url_params, the_url = self.get_params_from_url(self.get_grobid_url(method_name))
        merged = dict(url_params)
        if params:
            merged.update(params)
        if headers is None:
            headers = dict(self._DEFAULT_HEADERS)
        return the_url, merged, headers

    def _post_when_available(self, the_url, files, params, headers):
        """POST and, while the server answers 503, sleep and try again.

        The number of retries is not bounded (same as the historical
        behaviour), but a loop is used instead of recursion so a long outage
        cannot exhaust the call stack.

        :returns: ``(response, status)`` of the first non-503 answer.
        """
        while True:
            res, status = self.post(url=the_url, files=files, data=params, headers=headers)
            if status != 503:
                return res, status
            time.sleep(self.config['sleep_time'])

    def _process_text_field(self, field, value, method_name, params, headers):
        """Shared implementation of ``process_text`` and ``process_texts``.

        :returns: ``(status, parsed_json)`` on HTTP 200, ``(status, None)``
            otherwise.
        """
        the_url, params, headers = self._prepare_request(method_name, params, headers)
        res, status = self._post_when_available(the_url, {field: value}, params, headers)

        if status != 200:
            print('Processing failed with error ' + str(status))
            return status, None

        return status, json.loads(res.text)

    def process_texts(self, input, method_name='superconductors', params=None, headers=None):
        """Send ``input`` as the ``texts`` form field to ``method_name``.

        :param input: value posted as the ``texts`` field.
        :param method_name: key of the service in the URL mapping.
        :param params: extra form parameters, merged over those in the URL.
        :param headers: request headers; defaults to accepting JSON.
        :returns: ``(status, parsed_json)`` on HTTP 200, ``(status, None)``
            otherwise. Note the order: status comes first here.
        """
        return self._process_text_field('texts', input, method_name, params, headers)

    def process_text(self, input, method_name='superconductors', params=None, headers=None):
        """Send ``input`` as the ``text`` form field to ``method_name``.

        :param input: value posted as the ``text`` field.
        :param method_name: key of the service in the URL mapping.
        :param params: extra form parameters, merged over those in the URL.
        :param headers: request headers; defaults to accepting JSON.
        :returns: ``(status, parsed_json)`` on HTTP 200, ``(status, None)``
            otherwise. Note the order: status comes first here.
        """
        return self._process_text_field('text', input, method_name, params, headers)

    def process(self, form_data: dict, method_name='superconductors', params=None, headers=None):
        """POST an arbitrary multipart form to ``method_name``.

        :param form_data: mapping passed as ``files`` to the POST. If it holds
            open file objects they are re-sent as-is on a 503 retry
            (NOTE(review): a consumed stream would need rewinding by the
            caller).
        :param method_name: key of the service in the URL mapping.
        :param params: extra form parameters, merged over those in the URL.
        :param headers: request headers; defaults to accepting JSON.
        :returns: the response body text on HTTP 200, ``None`` otherwise.
        """
        the_url, params, headers = self._prepare_request(method_name, params, headers)
        # The retry re-posts ``form_data`` (it used to call ``process_text``
        # with the ``input`` builtin by mistake).
        res, status = self._post_when_available(the_url, form_data, params, headers)

        if status != 200:
            print('Processing failed with error ' + str(status))
            return None

        return res.text

    def process_pdf_batch(self, pdf_files, params={}):
        """Placeholder: batch processing is not implemented; returns ``None``."""
        pass

    @staticmethod
    def _extract_error_description(res):
        """Return the ``description`` field of a JSON error body, if any.

        Returns ``None`` when the body is empty, is not valid JSON, is not a
        JSON object, or has no ``description`` key.
        """
        if not res.content:
            return None
        try:
            payload = json.loads(res.text)
        except ValueError:
            # Non-JSON error page (e.g. HTML from a proxy): nothing to extract.
            return None
        if isinstance(payload, dict):
            return payload.get('description')
        return None

    def process_pdf(self, pdf_file, method_name, params=None, headers=None, verbose=False,
                    retry=None):
        """Upload a PDF to ``method_name`` and return the service answer.

        When the server answers 503 or 429 the request is retried after
        ``config['sleep_time']`` seconds, up to a bounded number of attempts.

        :param pdf_file: path of the PDF to upload.
        :param method_name: key of the service in the URL mapping.
        :param params: extra form parameters, merged over those in the URL.
        :param headers: request headers; defaults to accepting JSON.
        :param verbose: print progress messages about retries.
        :param retry: total number of attempts allowed (including the first);
            ``None`` means ``config['max_retry']``.
        :returns: ``(text, 200)`` on success; ``(None, 204)`` when there is no
            content; ``(None, status)`` when retries are exhausted;
            ``(description_or_None, status)`` for any other error.
        """
        the_url, params, headers = self._prepare_request(method_name, params, headers)

        attempts_left = retry
        while True:
            # Re-open the file for every attempt (the stream is consumed by
            # the upload) and make sure the handle is always closed.
            with open(pdf_file, 'rb') as pdf_handle:
                files = {
                    'input': (
                        pdf_file,
                        pdf_handle,
                        'application/pdf',
                        {'Expires': '0'},
                    )
                }
                res, status = self.post(
                    url=the_url,
                    files=files,
                    data=params,
                    headers=headers
                )

            if status not in (503, 429):
                break

            # Read max_retry lazily so a config without it still works as
            # long as the server is never saturated.
            if attempts_left is None:
                attempts_left = self.config['max_retry']
            attempts_left -= 1
            # ``<= 0`` (not ``== 0``) so max_retry == 1 or retry == 0 cannot
            # skip past zero and retry forever.
            if attempts_left <= 0:
                if verbose:
                    print("re-try exhausted. Aborting request")
                return None, status

            sleep_time = self.config['sleep_time']
            if verbose:
                print("Server is saturated, waiting", sleep_time, "seconds and trying again. ")
            time.sleep(sleep_time)

        if status == 204:
            return None, status
        if status != 200:
            return self._extract_error_description(res), status
        return res.text, status

    def get_params_from_url(self, the_url):
        """Split a URL into its query parameters and its query-less form.

        Values are returned verbatim (no percent-decoding). Empty segments
        (``a=1&&b=2``, trailing ``&``, bare ``?``) are ignored, a key without
        ``=`` maps to ``""`` and only the first ``=`` separates key and value.

        :returns: ``(params_dict, url_without_query)``.
        """
        the_url, _, query = the_url.partition("?")

        params = {}
        for pair in query.split("&"):
            if not pair:
                continue
            key, _, value = pair.partition("=")
            params[key] = value

        return params, the_url

    def process_json(self, text, method_name="processJson", params=None, headers=None,
                     verbose=False):
        """Send a JSON document as the ``input`` form part to ``method_name``.

        :param text: JSON payload, posted with content type
            ``application/json``.
        :param method_name: key of the service in the URL mapping.
        :param params: extra form parameters, merged over those in the URL.
        :param headers: request headers; defaults to accepting JSON.
        :param verbose: print a message when no usable content is returned.
        :returns: ``(text, 200)`` on success, ``(None, status)`` otherwise
            (including 204, no content).
        """
        files = {
            'input': (
                None,
                text,
                'application/json',
                {'Expires': '0'},
            )
        }

        the_url, params, headers = self._prepare_request(method_name, params, headers)
        # Retrying in a loop returns a flat (text, status) pair; the former
        # recursive retry wrapped the result in a second tuple.
        res, status = self._post_when_available(the_url, files, params, headers)

        # 204 must be tested before the generic non-200 branch, otherwise it
        # is unreachable.
        if status == 204:
            if verbose:
                print('No content returned. Moving on. ')
            return None, status
        if status != 200:
            if verbose:
                print('Processing failed with error ', status)
            return None, status
        return res.text, status
|
|