# Spaces: Sleeping Sleeping  (appears to be web-page banner residue from extraction; not part of the configuration)
# this is the configuration file for the GROBID instance

grobid:
  # where all the Grobid resources are stored (models, lexicon, native libraries, etc.), normally no need to change
  grobidHome: "grobid-home"

  # path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or absolute path (/tmp)
  temp: "tmp"

  # normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib)
  nativelibrary: "lib"

  pdf:
    pdfalto:
      # path relative to the grobid-home path (e.g. grobid-home/pdfalto), you don't want to change this normally
      path: "pdfalto"
      # security for PDF parsing
      memoryLimitMb: 6096
      timeoutSec: 120

    # security relative to the PDF parsing result
    blocksMax: 200000
    tokensMax: 1000000

  consolidation:
    # define the bibliographical data consolidation service to be used, either "crossref" for CrossRef REST API or
    # "glutton" for https://github.com/kermitt2/biblio-glutton
    service: "crossref"
    #service: "glutton"
    glutton:
      url: "https://cloud.science-miner.com/glutton"
      #url: "http://localhost:8080"
    crossref:
      mailto:
      # to use crossref web API, you need normally to use it politely and to indicate an email address here, e.g.
      #mailto: "toto@titi.tutu"
      token:
      # to use Crossref metadata plus service (available by subscription)
      #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"

  proxy:
    # proxy to be used when doing external call to the consolidation service
    host:
    port:

  # CORS configuration for the GROBID web API service
  corsAllowedOrigins: "*"
  corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
  corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"

  # the actual implementation for language recognition to be used
  languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"

  # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
  sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
  # sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"

  # maximum concurrency allowed to GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
  # for a production server running only GROBID, set the value slightly above the available number of threads of the server
  # to get best performance and security
  concurrency: 10
  # when the pool is full, for queries waiting for the availability of a Grobid engine, this is the maximum time wait to try
  # to get an engine (in seconds) - normally never change it
  poolMaxWait: 1

  delft:
    # DeLFT global parameters
    # delft installation path if Deep Learning architectures are used to implement one of the sequence labeling model,
    # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF are used)
    install: "../delft"
    pythonVirtualEnv:

  wapiti:
    # Wapiti global parameters
    # number of threads for training the wapiti models (0 to use all available processors)
    nbThreads: 0

  models:
    # we configure here how each sequence labeling model should be implemented
    # for feature-engineered CRF, use "wapiti" and possible training parameters are window, epsilon and nbMaxIterations
    # for Deep Learning, use "delft" and select the target DL architecture (see DeLFT library), the training
    # parameters then depend on this selected DL architecture

    - name: "segmentation"
      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.0000001
        window: 50
        nbMaxIterations: 2000
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 3000
          batch_size: 1
        training:
          # parameters used for training
          max_sequence_length: 3000
          batch_size: 10

    - name: "fulltext"
      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
      engine: "wapiti"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.0001
        window: 20
        nbMaxIterations: 1500

    - name: "header"
      #engine: "wapiti"
      engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.000001
        window: 30
        nbMaxIterations: 1500
      delft:
        # deep learning parameters
        architecture: "BidLSTM_ChainCRF_FEATURES"
        #transformer: "allenai/scibert_scivocab_cased"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          #max_sequence_length: 510
          max_sequence_length: 3000
          batch_size: 1
        training:
          # parameters used for training
          #max_sequence_length: 510
          #batch_size: 6
          max_sequence_length: 3000
          batch_size: 9

    - name: "reference-segmenter"
      #engine: "wapiti"
      engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_ChainCRF_FEATURES"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction (for this model, use same max_sequence_length as training)
          max_sequence_length: 3000
          batch_size: 2
        training:
          # parameters used for training
          max_sequence_length: 3000
          batch_size: 10

    - name: "name-header"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "name-citation"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "date"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "figure"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF"

    - name: "table"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF"

    - name: "affiliation-address"
      #engine: "wapiti"
      engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "citation"
      #engine: "wapiti"
      engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 50
        nbMaxIterations: 3000
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        #architecture: "BERT_CRF"
        #transformer: "michiyasunaga/LinkBERT-base"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 500
          batch_size: 30
        training:
          # parameters used for training
          max_sequence_length: 500
          batch_size: 50

    - name: "patent-citation"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.0001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        #architecture: "BERT_CRF"
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 800
          batch_size: 20
        training:
          # parameters used for training
          max_sequence_length: 1000
          batch_size: 40

    - name: "funding-acknowledgement"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 50
        nbMaxIterations: 2000
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        #architecture: "BERT_CRF"
        #transformer: "michiyasunaga/LinkBERT-base"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 800
          batch_size: 20
        training:
          # parameters used for training
          max_sequence_length: 500
          batch_size: 40

    - name: "copyright"
      # at this time, we only have a DeLFT implementation,
      # use "wapiti" if the deep learning library JNI is not available and model will then be ignored
      #engine: "delft"
      engine: "wapiti"
      delft:
        # deep learning parameters
        architecture: "gru"
        #architecture: "bert"
        #transformer: "allenai/scibert_scivocab_cased"

    - name: "license"
      # at this time, for being active, it must be DeLFT, no other implementation is available
      # use "wapiti" if the deep learning library JNI is not available and model will then be ignored
      #engine: "delft"
      engine: "wapiti"
      delft:
        # deep learning parameters
        architecture: "gru"
        #architecture: "bert"
        #transformer: "allenai/scibert_scivocab_cased"

  # for **service only**: how to load the models,
  # false -> models are loaded when needed, avoiding putting in memory useless models (only in case of CRF) but slow down
  #          significantly the service at first call
  # true -> all the models are loaded into memory at the server startup (default), slow the start of the services
  #         and models not used will take some more memory (only in case of CRF), but server is immediately warm and ready
  modelPreload: true

# HTTP server settings: one application connector and one admin connector
server:
  type: custom
  applicationConnectors:
    - type: http
      port: 8070
  adminConnectors:
    - type: http
      port: 8071
  registerDefaultExceptionMappers: false
  # change the following for having all http requests logged
  requestLog:
    appenders: []

# these logging settings apply to the Grobid service usage mode
logging:
  level: INFO
  loggers:
    # "OFF" is quoted so it stays a string instead of being read as a YAML 1.1 boolean
    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
    org.glassfish.jersey.internal: "OFF"
    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
  appenders:
    - type: console
      threshold: INFO
      timeZone: UTC
      # uncomment to have the logs in json format
      # layout:
      #   type: json