File size: 11,300 Bytes
f6bfed1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de1b677
 
f6bfed1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c8fe9c4
 
f6bfed1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c8fe9c4
 
f6bfed1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c8fe9c4
 
f6bfed1
 
 
 
 
c8fe9c4
 
f6bfed1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d2c11b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
# this is the configuration file for the GROBID instance

grobid:
  # root directory where all the GROBID resources are stored (models, lexicon, native libraries, etc.);
  # normally there is no need to change it
  grobidHome: "grobid-home"

  # temporary directory: either a path relative to the grobid-home path (e.g. tmp for grobid-home/tmp)
  # or an absolute path (e.g. /tmp)
  temp: "tmp"
  
  # native library directory, given as a path relative to the grobid-home path (e.g. grobid-home/lib);
  # normally nothing to change here
  nativelibrary: "lib"

  # settings related to PDF parsing
  pdf:
    pdfalto:
      # path relative to the grobid-home path (e.g. grobid-home/pdfalto); normally you do not want to change this
      path: "pdfalto"
      # safety limits for the PDF parsing itself: memory limit (in MB, per the key name)
      # and timeout (in seconds, per the key name)
      memoryLimitMb: 6096
      timeoutSec: 120

    # safety limits relative to the PDF parsing result
    # NOTE(review): presumably the maximum number of layout blocks and of tokens accepted for one document - confirm
    blocksMax: 200000
    tokensMax: 1000000

  consolidation:
    # bibliographical data consolidation service to be used: either "crossref" for the CrossRef REST API
    # or "glutton" for biblio-glutton (https://github.com/kermitt2/biblio-glutton)
    service: "crossref"
    #service: "glutton"
    glutton:
      url: "https://cloud.science-miner.com/glutton"
      #url: "http://localhost:8080"
    crossref:
      # to use the crossref web API politely, you normally need to indicate an email address here, e.g.
      #mailto: "toto@titi.tutu"
      # (left empty in this configuration)
      mailto:
      # authorization token for the Crossref Metadata Plus service (available by subscription), e.g.
      #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"
      # (left empty in this configuration)
      token:

  proxy:
    # proxy to be used for external calls to the consolidation service;
    # host and port are both left empty in this configuration
    host:
    port:

  # CORS configuration for the GROBID web API service
  # "*" must stay quoted (a bare * would start a YAML alias)
  # NOTE(review): "*" presumably accepts requests from any origin - consider restricting it on an exposed server, confirm
  corsAllowedOrigins: "*"
  # comma-separated lists of the allowed HTTP methods and request headers
  corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
  corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"

  # the actual implementation to be used for language recognition (fully qualified class name)
  languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"

  # the actual implementation to be used for the optional sentence segmentation (PragmaticSegmenter or OpenNLP);
  # the PragmaticSegmenter one is selected here, the OpenNLP alternative is kept commented out below
  sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
  # sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"
  
  # maximum concurrency allowed to the GROBID server for processing parallel requests - change it according to
  # your CPU/GPU capacities;
  # for a production server running only GROBID, set the value slightly above the number of available threads
  # of the server to get the best performance and security
  concurrency: 10
  # when the pool is full, maximum time (in seconds) that a query waiting for the availability of a Grobid engine
  # will wait to get one - normally never change it
  poolMaxWait: 1

  delft:
    # DeLFT global parameters
    # install: DeLFT installation path, needed if Deep Learning architectures are used to implement one of the
    # sequence labeling models; embeddings are usually compiled as lmdb under delft/data
    # (this parameter is ignored if only feature-engineered CRF models are used)
    install: "../delft"
    # left empty in this configuration
    # NOTE(review): presumably the path of a Python virtual environment to be used by DeLFT - confirm
    pythonVirtualEnv:

  wapiti:
    # Wapiti global parameters
    # number of threads used for training the Wapiti models (0 means using all the available processors)
    nbThreads: 0

  models:
    # we configure here how each sequence labeling model should be implemented:
    # - for feature-engineered CRF, use "wapiti"; the possible training parameters are window, epsilon and nbMaxIterations
    # - for Deep Learning, use "delft" and select the target DL architecture (see the DeLFT library); the training
    #   parameters then depend on the selected DL architecture
    
    # "segmentation" model: Wapiti is the selected engine (the DeLFT alternative is commented out)
    - name: "segmentation"
      # at this time, this model must always use the Wapiti CRF engine: the input sequence size is too large
      # for a Deep Learning implementation
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # Wapiti training parameters, they are used at training time only
        epsilon: 0.0000001
        window: 50
        nbMaxIterations: 2000
      delft:
        # deep learning parameters
        # NOTE(review): presumably only read when engine is set to "delft" - confirm
        architecture: "BidLSTM_CRF_FEATURES"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 3000
          batch_size: 1
        training:
          # parameters used for training
          max_sequence_length: 3000
          batch_size: 10

    # "fulltext" model: Wapiti is the selected engine, no DeLFT section is defined for it
    - name: "fulltext"
      # at this time, this model must always use the Wapiti CRF engine: the input sequence size is too large
      # for a Deep Learning implementation
      engine: "wapiti"
      wapiti:
        # Wapiti training parameters, they are used at training time only
        epsilon: 0.0001
        window: 20
        nbMaxIterations: 1500

    # "header" model: DeLFT is the selected engine (the Wapiti alternative is commented out)
    - name: "header"
      #engine: "wapiti"
      engine: "delft"
      wapiti:
        # Wapiti training parameters, they are used at training time only
        epsilon: 0.000001
        window: 30
        nbMaxIterations: 1500
      delft:
        # deep learning parameters
        architecture: "BidLSTM_ChainCRF_FEATURES"
        # optional transformer setting, kept commented out
        #transformer: "allenai/scibert_scivocab_cased"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          # NOTE(review): the commented-out 510 value is presumably meant for the transformer option above - confirm
          #max_sequence_length: 510
          max_sequence_length: 3000
          batch_size: 1
        training:
          # parameters used for training (commented-out values are alternative settings)
          #max_sequence_length: 510
          #batch_size: 6
          max_sequence_length: 3000
          batch_size: 9

    # "reference-segmenter" model: DeLFT is the selected engine (the Wapiti alternative is commented out)
    - name: "reference-segmenter"
      #engine: "wapiti"
      engine: "delft"
      wapiti:
        # Wapiti training parameters, they are used at training time only
        # (no nbMaxIterations is set for this model)
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_ChainCRF_FEATURES"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction (for this model, use the same max_sequence_length as for training)
          max_sequence_length: 3000
          batch_size: 2
        training:
          # parameters used for training
          max_sequence_length: 3000
          batch_size: 10

    # "name-header" model: Wapiti is the selected engine (the DeLFT alternative is commented out);
    # no Wapiti training parameters are set for this model
    - name: "name-header"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    # "name-citation" model: Wapiti is the selected engine (the DeLFT alternative is commented out);
    # no Wapiti training parameters are set for this model
    - name: "name-citation"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    # "date" model: Wapiti is the selected engine (the DeLFT alternative is commented out);
    # no Wapiti training parameters are set for this model
    - name: "date"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    # "figure" model: Wapiti is the selected engine (the DeLFT alternative is commented out)
    - name: "figure"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # Wapiti training parameters, they are used at training time only
        # (no nbMaxIterations is set for this model)
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF"

    # "table" model: Wapiti is the selected engine (the DeLFT alternative is commented out)
    - name: "table"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # Wapiti training parameters, they are used at training time only
        # (no nbMaxIterations is set for this model)
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF"

    # "affiliation-address" model: DeLFT is the selected engine (the Wapiti alternative is commented out)
    - name: "affiliation-address"
      #engine: "wapiti"
      engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    # "citation" model: DeLFT is the selected engine (the Wapiti alternative is commented out)
    - name: "citation"
      #engine: "wapiti"
      engine: "delft"
      wapiti:
        # Wapiti training parameters, they are used at training time only
        epsilon: 0.00001
        window: 50
        nbMaxIterations: 3000
      delft:
        # deep learning parameters (the BERT_CRF / transformer alternative is kept commented out)
        architecture: "BidLSTM_CRF_FEATURES"
        #architecture: "BERT_CRF"
        #transformer: "michiyasunaga/LinkBERT-base"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 500
          batch_size: 30
        training:
          # parameters used for training
          max_sequence_length: 500
          batch_size: 50

    # "patent-citation" model: Wapiti is the selected engine (the DeLFT alternative is commented out)
    - name: "patent-citation"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # Wapiti training parameters, they are used at training time only
        # (no nbMaxIterations is set for this model)
        epsilon: 0.0001
        window: 20
      delft:
        # deep learning parameters (the BERT_CRF alternative is kept commented out)
        architecture: "BidLSTM_CRF_FEATURES"
        #architecture: "BERT_CRF"
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 800
          batch_size: 20
        training:
          # parameters used for training
          max_sequence_length: 1000
          batch_size: 40

    # "funding-acknowledgement" model: Wapiti is the selected engine (the DeLFT alternative is commented out)
    - name: "funding-acknowledgement"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # Wapiti training parameters, they are used at training time only
        epsilon: 0.00001
        window: 50
        nbMaxIterations: 2000
      delft:
        # deep learning parameters (the BERT_CRF / transformer alternative is kept commented out)
        architecture: "BidLSTM_CRF_FEATURES"
        #architecture: "BERT_CRF"
        #transformer: "michiyasunaga/LinkBERT-base"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          # NOTE(review): this runtime max_sequence_length (800) is larger than the training one (500) - confirm it is intended
          max_sequence_length: 800
          batch_size: 20
        training:
          # parameters used for training
          max_sequence_length: 500
          batch_size: 40

    # "copyright" model: "wapiti" is currently selected, see the note below on what this implies
    - name: "copyright"
      # at this time, only a DeLFT implementation exists for this model;
      # use "wapiti" if the deep learning library JNI is not available, the model will then be ignored
      #engine: "delft"
      engine: "wapiti"
      delft:
        # deep learning parameters (the bert / transformer alternative is kept commented out)
        architecture: "gru"
        #architecture: "bert"
        #transformer: "allenai/scibert_scivocab_cased"

    # "license" model: "wapiti" is currently selected, see the note below on what this implies
    - name: "license"
      # at this time, this model must use DeLFT to be active, no other implementation is available;
      # use "wapiti" if the deep learning library JNI is not available, the model will then be ignored
      #engine: "delft"
      engine: "wapiti"
      delft:
        # deep learning parameters (the bert / transformer alternative is kept commented out)
        architecture: "gru"
        #architecture: "bert"
        #transformer: "allenai/scibert_scivocab_cased"

  # for **service only**: how to load the models,
  # false -> models are loaded when needed, which avoids keeping unused models in memory (only in case of CRF)
  #          but significantly slows down the service at first call
  # true -> all the models are loaded into memory at server startup (default), which slows down the start of the
  #         service, and the unused models will take some more memory (only in case of CRF), but the server is
  #         immediately warm and ready
  modelPreload: true

# web server settings: one http connector for the application and one for the admin interface
server:
  type: custom
  applicationConnectors:
    - type: http
      port: 8070
  adminConnectors:
    - type: http
      port: 8071
  registerDefaultExceptionMappers: false
  # change the following to have all the http requests logged (the appender list is empty here)
  requestLog:
    appenders: []

# these logging settings apply to the GROBID service usage mode
logging:
  level: INFO
  loggers:
    # per-logger levels; "OFF" is quoted so that it is read as a string and not as a YAML boolean
    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
    org.glassfish.jersey.internal: "OFF"
    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
  appenders:
    # a single console appender is configured
    - type: console
      threshold: INFO
      timeZone: UTC
      # uncomment the two lines below to have the logs in json format
      # layout:
      #   type: json