Graph Machine Learning
AnemoI
English
jpxkqx committed on
Commit
9bc9286
·
verified ·
1 Parent(s): 96ad5cc

Upload config_pretraining.yaml

Browse files
Files changed (1) hide show
  1. config_pretraining.yaml +540 -0
config_pretraining.yaml ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ format: zarr
3
+ resolution: n320
4
+ frequency: 6h
5
+ timestep: 6h
6
+ forcing:
7
+ - cos_latitude
8
+ - cos_longitude
9
+ - sin_latitude
10
+ - sin_longitude
11
+ - cos_julian_day
12
+ - cos_local_time
13
+ - sin_julian_day
14
+ - sin_local_time
15
+ - insolation
16
+ - lsm
17
+ - sdor
18
+ - slor
19
+ - z
20
+ diagnostic:
21
+ - tp
22
+ - cp
23
+ - sf
24
+ - tcc
25
+ - hcc
26
+ - lcc
27
+ - mcc
28
+ - ro
29
+ - ssrd
30
+ - strd
31
+ - 100u
32
+ - 100v
33
+ remapped: null
34
+ normalizer:
35
+ default: mean-std
36
+ remap:
37
+ cp: tp
38
+ sf: tp
39
+ std:
40
+ - tp
41
+ - cp
42
+ - sf
43
+ - ro
44
+ - tcw
45
+ - ssrd
46
+ - q_50
47
+ - q_100
48
+ - q_150
49
+ - q_200
50
+ - q_250
51
+ - q_300
52
+ - q_400
53
+ - q_500
54
+ - q_600
55
+ - q_700
56
+ - q_850
57
+ - q_925
58
+ - q_1000
59
+ min-max: null
60
+ max:
61
+ - sdor
62
+ - slor
63
+ - z
64
+ none:
65
+ - cos_latitude
66
+ - cos_longitude
67
+ - sin_latitude
68
+ - sin_longitude
69
+ - cos_julian_day
70
+ - cos_local_time
71
+ - sin_julian_day
72
+ - sin_local_time
73
+ - insolation
74
+ - lsm
75
+ - tcc
76
+ - mcc
77
+ - hcc
78
+ - lcc
79
+ - swvl1
80
+ - swvl2
81
+ imputer:
82
+ default: none
83
+ remapper:
84
+ default: none
85
+ processors:
86
+ normalizer:
87
+ _target_: anemoi.models.preprocessing.normalizer.InputNormalizer
88
+ _convert_: all
89
+ config:
90
+ default: mean-std
91
+ remap:
92
+ cp: tp
93
+ sf: tp
94
+ std:
95
+ - tp
96
+ - cp
97
+ - sf
98
+ - ro
99
+ - tcw
100
+ - ssrd
101
+ - q_50
102
+ - q_100
103
+ - q_150
104
+ - q_200
105
+ - q_250
106
+ - q_300
107
+ - q_400
108
+ - q_500
109
+ - q_600
110
+ - q_700
111
+ - q_850
112
+ - q_925
113
+ - q_1000
114
+ min-max: null
115
+ max:
116
+ - sdor
117
+ - slor
118
+ - z
119
+ none:
120
+ - cos_latitude
121
+ - cos_longitude
122
+ - sin_latitude
123
+ - sin_longitude
124
+ - cos_julian_day
125
+ - cos_local_time
126
+ - sin_julian_day
127
+ - sin_local_time
128
+ - insolation
129
+ - lsm
130
+ - tcc
131
+ - mcc
132
+ - hcc
133
+ - lcc
134
+ - swvl1
135
+ - swvl2
136
+ num_features: 115
137
+
138
+ dataloader:
139
+ prefetch_factor: 2
140
+ pin_memory: True
141
+ read_group_size: 4
142
+ num_workers:
143
+ training: 4
144
+ validation: 4
145
+ test: 8
146
+ predict: 8
147
+ batch_size:
148
+ training: 1
149
+ validation: 1
150
+ test: 4
151
+ predict: 4
152
+ limit_batches:
153
+ training: null
154
+ validation: 10
155
+ test: 20
156
+ predict: 20
157
+ dataset: ${hardware.paths.data}/${hardware.files.dataset}
158
+ land_dataset: ${hardware.paths.data}/${hardware.files.dataset_land}
159
+ training:
160
+ dataset:
161
+ - dataset: ${dataloader.dataset}
162
+ start: null
163
+ end: 2022
164
+ frequency: ${data.frequency}
165
+ drop: []
166
+ - dataset: ${dataloader.land_dataset}
167
+ start: null
168
+ end: 2022
169
+ frequency: ${data.frequency}
170
+ drop:
171
+ - anor
172
+ - isor
173
+ - lsp
174
+ - stl3
175
+ - swvl3
176
+ - tsn
177
+ - rsn
178
+ - sd
179
+ - tvh
180
+ - tvl
181
+ - cvh
182
+ - cvl
183
+ - cl
184
+ - slt
185
+ - lai_lv
186
+ - lai_hv
187
+ start: null
188
+ end: 2022
189
+ drop: []
190
+ validation:
191
+ dataset:
192
+ - dataset: ${dataloader.dataset}
193
+ start: 2022
194
+ end: 2022
195
+ frequency: ${data.frequency}
196
+ drop: []
197
+ - dataset: ${dataloader.land_dataset}
198
+ start: 2022
199
+ end: 2022
200
+ frequency: ${data.frequency}
201
+ drop:
202
+ - anor
203
+ - isor
204
+ - lsp
205
+ - stl3
206
+ - swvl3
207
+ - tsn
208
+ - rsn
209
+ - sd
210
+ - tvh
211
+ - tvl
212
+ - cvh
213
+ - cvl
214
+ - cl
215
+ - slt
216
+ - lai_lv
217
+ - lai_hv
218
+ start: 2022
219
+ end: 2022
220
+ drop: []
221
+ validation_rollout: 1
222
+
223
+ diagnostics:
224
+ plot:
225
+ asynchronous: False
226
+ datashader: True
227
+ frequency:
228
+ batch: 750
229
+ epoch: 10
230
+ parameters:
231
+ - tp
232
+ sample_idx: 0
233
+ precip_and_related_fields:
234
+ - tp
235
+ - cp
236
+ callbacks:
237
+ - _target_: anemoi.training.diagnostics.callbacks.plot.PlotLoss
238
+ parameter_groups:
239
+ moisture: [tp, cp, tcw]
240
+ sfc_wind: [10u, 10v]
241
+ - _target_: anemoi.training.diagnostics.callbacks.plot.PlotSample
242
+ sample_idx: 0
243
+ per_sample: 6
244
+ parameters: [tp]
245
+ accumulation_levels_plot: [0, 0.05, 0.1, 0.25, 0.5, 1, 1.5, 2, 3, 4, 5, 6, 7, 100]
246
+ cmap_accumulation:
247
+ - "#ffffff"
248
+ - "#04e9e7"
249
+ - "#019ff4"
250
+ - "#0300f4"
251
+ - "#02fd02"
252
+ - "#01c501"
253
+ - "#008e00"
254
+ - "#fdf802"
255
+ - "#e5bc00"
256
+ - "#fd9500"
257
+ - "#fd0000"
258
+ - "#d40000"
259
+ - "#bc0000"
260
+ - "#f800fd"
261
+ precip_and_related_fields: [tp, cp]
262
+ enabled: True
263
+ scatter: False
264
+ mode: asyncio
265
+ callbacks: {}
266
+ benchmark_profiler:
267
+ memory:
268
+ enabled: True
269
+ steps: 5
270
+ warmup: 2
271
+ extra_plots: False
272
+ trace_rank0_only: False
273
+ time:
274
+ enabled: True
275
+ verbose: False
276
+ speed:
277
+ enabled: True
278
+ system:
279
+ enabled: True
280
+ model_summary:
281
+ enabled: True
282
+ snapshot:
283
+ enabled: True
284
+ steps: 4
285
+ warmup: 0
286
+ debug:
287
+ anomaly_detection: False
288
+ profiler: False
289
+ enable_checkpointing: True
290
+ checkpoint:
291
+ every_n_minutes:
292
+ save_frequency: 30
293
+ num_models_saved: 3
294
+ every_n_epochs:
295
+ save_frequency: 1
296
+ num_models_saved: 3
297
+ every_n_train_steps:
298
+ save_frequency: null
299
+ num_models_saved: 0
300
+ log:
301
+ wandb:
302
+ enabled: False
303
+ tensorboard:
304
+ enabled: False
305
+ mlflow:
306
+ enabled: True
307
+ offline: True
308
+ authentication: False
309
+ log_model: False
310
+ tracking_uri: ${oc.decode:${oc.env:MLFLOW_TRACKING_URI}}
311
+ experiment_name: aifs-single-benchmark
312
+ project_name: Anemoi
313
+ system: True
314
+ terminal: True
315
+ run_name: aifs-single-v1.0-pretraining
316
+ on_resume_create_child: True
317
+ expand_hyperparams:
318
+ - config
319
+ interval: 100
320
+ enable_progress_bar: True
321
+ print_memory_summary: False
322
+
323
+ hardware:
324
+ paths:
325
+ data: ${oc.decode:${oc.env:DATASETS_PATH}}
326
+ output: ${oc.decode:${oc.env:OUTPUT_DIR}}
327
+ logs:
328
+ base: ${hardware.paths.output}/logs
329
+ wandb: ${hardware.paths.output}/logs
330
+ mlflow: ${hardware.paths.output}/logs/mlflow
331
+ tensorboard: ${hardware.paths.output}/logs/tensorboard
332
+ checkpoints: ${hardware.paths.output}/checkpoint/f6fa3588874441789433af098d660844
333
+ plots: ${hardware.paths.output}/plots/f6fa3588874441789433af098d660844
334
+ profiler: ${hardware.paths.output}/profiler/
335
+ graph: ${hardware.paths.output}/graphs/
336
+ files:
337
+ dataset: aifs-ea-an-oper-0001-mars-n320-1979-2022-6h-v6.zarr
338
+ dataset_land: aifs-ea-an-oper-0001-mars-n320-1979-2023-6h-v1-land.zarr
339
+ dataset_precip: aifs-od-an-oper-0001-mars-n320-2016-2023-6h-v2-precipitations.zarr
340
+ graph: graph_enc_proc_dec_n320.pt
341
+ checkpoint:
342
+ every_n_epochs: aifs-by_epoch-epoch_{epoch:03d}-val_wmse_{val_wmse:.3e}
343
+ every_n_train_steps: aifs-by_step-epoch_{epoch:03d}-step_{step:06d}
344
+ every_n_minutes: aifs-by_time-epoch_{epoch:03d}-step_{step:06d}
345
+ warm_start: null
346
+ accelerator: auto
347
+ num_gpus_per_node: 4
348
+ num_nodes: 16
349
+ num_gpus_per_model: 4
350
+
351
+ graph:
352
+ overwrite: True
353
+ data: data
354
+ hidden: hidden
355
+ nodes:
356
+ data:
357
+ node_builder:
358
+ _target_: anemoi.graphs.nodes.ZarrDatasetNodes
359
+ dataset: ${dataloader.dataset}
360
+ attributes:
361
+ area_weight:
362
+ _target_: anemoi.graphs.nodes.attributes.AreaWeights
363
+ norm: unit-max
364
+ hidden:
365
+ node_builder:
366
+ _target_: anemoi.graphs.nodes.ReducedGaussianGridNodes
367
+ grid: o96
368
+ edges:
369
+ - source_name: data
370
+ target_name: hidden
371
+ edge_builder:
372
+ _target_: anemoi.graphs.edges.CutOffEdges
373
+ cutoff_factor: 0.6
374
+ attributes:
375
+ edge_length:
376
+ _target_: anemoi.graphs.edges.attributes.EdgeLength
377
+ norm: unit-std
378
+ edge_dirs:
379
+ _target_: anemoi.graphs.edges.attributes.EdgeDirection
380
+ norm: unit-std
381
+ - source_name: hidden
382
+ target_name: data
383
+ edge_builder:
384
+ _target_: anemoi.graphs.edges.KNNEdges
385
+ num_nearest_neighbours: 3
386
+ attributes:
387
+ edge_length:
388
+ _target_: anemoi.graphs.edges.attributes.EdgeLength
389
+ norm: unit-std
390
+ edge_dirs:
391
+ _target_: anemoi.graphs.edges.attributes.EdgeDirection
392
+ norm: unit-std
393
+
394
+ model:
395
+ activation: GELU
396
+ num_channels: 1024
397
+ model:
398
+ _target_: anemoi.models.models.encoder_processor_decoder.AnemoiModelEncProcDec
399
+ processor:
400
+ _target_: anemoi.models.layers.processor.TransformerProcessor
401
+ _convert_: all
402
+ activation: GELU
403
+ num_layers: 16
404
+ num_chunks: 2
405
+ mlp_hidden_ratio: 4
406
+ num_heads: 16
407
+ window_size: 1120
408
+ dropout_p: 0
409
+ encoder:
410
+ _target_: anemoi.models.layers.mapper.GraphTransformerForwardMapper
411
+ _convert_: all
412
+ trainable_size: 8
413
+ sub_graph_edge_attributes: [edge_length, edge_dirs]
414
+ activation: GELU
415
+ num_chunks: 1
416
+ mlp_hidden_ratio: 4
417
+ num_heads: 16
418
+ decoder:
419
+ _target_: anemoi.models.layers.mapper.GraphTransformerBackwardMapper
420
+ _convert_: all
421
+ trainable_size: 8
422
+ sub_graph_edge_attributes: [edge_length, edge_dirs]
423
+ activation: GELU
424
+ num_chunks: 1
425
+ mlp_hidden_ratio: 4
426
+ num_heads: 16
427
+ trainable_parameters:
428
+ data: 8
429
+ hidden: 8
430
+ data2hidden: 8
431
+ hidden2data: 8
432
+ attributes:
433
+ edges: [edge_length, edge_dirs]
434
+ nodes: []
435
+ node_loss_weight: area_weight
436
+ bounding:
437
+ - _target_: anemoi.models.layers.bounding.ReluBounding
438
+ variables:
439
+ - tp
440
+ - ro
441
+ - tcw
442
+ - ssrd
443
+ - q_50
444
+ - q_100
445
+ - q_150
446
+ - q_200
447
+ - q_250
448
+ - q_300
449
+ - q_400
450
+ - q_500
451
+ - q_600
452
+ - q_700
453
+ - q_850
454
+ - q_925
455
+ - q_1000
456
+ - _target_: anemoi.models.layers.bounding.HardtanhBounding
457
+ variables: [tcc, swvl1, swvl2]
458
+ min_val: 0
459
+ max_val: 1
460
+ - _target_: anemoi.models.layers.bounding.FractionBounding
461
+ variables: [cp, sf]
462
+ min_val: 0
463
+ max_val: 1
464
+ total_var: tp
465
+ - _target_: anemoi.models.layers.bounding.FractionBounding
466
+ variables: [lcc, mcc, hcc]
467
+ min_val: 0
468
+ max_val: 1
469
+ total_var: tcc
470
+
471
+ training:
472
+ run_id: null
473
+ fork_run_id: null
474
+ load_weights_only: null
475
+ deterministic: False
476
+ precision: 16-mixed
477
+ multistep_input: 2
478
+ accum_grad_batches: 1
479
+ num_sanity_val_steps: 6
480
+ gradient_clip:
481
+ val: 32
482
+ algorithm: value
483
+ swa:
484
+ enabled: False
485
+ lr: 0.0001
486
+ zero_optimizer: False
487
+ training_loss:
488
+ _target_: anemoi.training.losses.mse.WeightedMSELoss
489
+ scalars: [variable, loss_weights_mask]
490
+ ignore_nans: False
491
+ loss_gradient_scaling: False
492
+ validation_metrics:
493
+ - _target_: anemoi.training.losses.mse.WeightedMSELoss
494
+ scalars: []
495
+ ignore_nans: True
496
+ rollout:
497
+ start: 1
498
+ epoch_increment: 0
499
+ max: 1
500
+ max_epochs: null
501
+ max_steps: 260000
502
+ lr:
503
+ rate: 0.00003125
504
+ iterations: 260000
505
+ min: 3.0e-7
506
+ variable_loss_scaling:
507
+ default: 1
508
+ pl:
509
+ q: 0.6
510
+ t: 6
511
+ u: 0.8
512
+ v: 0.5
513
+ w: 0.001
514
+ z: 12
515
+ sfc:
516
+ sp: 10
517
+ 10u: 0.5
518
+ 10v: 0.5
519
+ 100u: 0.1
520
+ 100v: 0.1
521
+ 2d: 0.5
522
+ tp: 0.025
523
+ cp: 0.0025
524
+ ro: 0.005
525
+ sf: 0.025
526
+ tcc: 0.1
527
+ mcc: 0.1
528
+ lcc: 0.1
529
+ hcc: 0.1
530
+ swvl2: 200
531
+ swvl1: 100
532
+ stl2: 10
533
+ stl1: 1
534
+ ssrd: 0.05
535
+ strd: 0.1
536
+ metrics: [z_500, t_850, u_850, v_850]
537
+ pressure_level_scaler:
538
+ _target_: anemoi.training.data.scaling.ReluPressureLevelScaler
539
+ minimum: 0.2
540
+ slope: 0.001