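# Fine-tuning config for a Chinese text-embedding model: continues training
# bge-base-zh with an EWC regularizer plus CoSENT and in-batch contrastive losses.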
task_name: general
model_name: bge
model_dir: /iyunwen/nlpdata/PublicPretrainedModel/bge-base-zh/
use_deepspeed: true
desc: "piccolo"
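# train_method "ewc" refers to Elastic Weight Consolidation (Kirkpatrick et al.,
# 2017), a regularizer that penalizes drift from the pretrained weights to limit
# catastrophic forgetting: L_ewc = sum_i F_i * (theta_i - theta*_i)^2, where F is
# the Fisher information and theta* the original bge weights. The *_ratio values
# below are assumed to be the weights of the corresponding loss terms in the total
# loss; cosent_ratio 20.0 also matches the scale factor lambda = 20 used in the
# original CoSENT formulation.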
train_method: "ewc"
ewc_ratio: 10.0
cosent_ratio: 20.0
in_batch_ratio: 30.0
save_steps: 50
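# hard_neg_ratio is presumably the fraction of hard negatives mixed into each
# in-batch training batch (assumption; the exact semantics live in the trainer code)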
hard_neg_ratio: 0.2
in_batch_train_paths:
  # the qp pairs in synthetic_qp still use bge vectors
  synthetic_qp:
    - /iyunwen/nlpdata/work/LP/Data/VecData/v2/wudao_synthetic_alpaca2_hfl_0_100000_vec_neg.jsonl
    - /iyunwen/nlpdata/work/LP/Data/VecData/v2/m3e_synthetic_alpaca2_hfl_0_100000_vec_neg.jsonl
  # hard negatives in normal default to bm25
  normal:
    - /iyunwen/nlpdata/work/LP/Data/VecData/v2/m3e_long_length_hard_neg.jsonl
    - /iyunwen/nlpdata/work/LP/Data/VecData/v2/wudao_long_length_hard_neg.jsonl
    - /iyunwen/nlpdata/work/LP/Data/VecData/stella/mrc_data.jsonl
    - /iyunwen/nlpdata/work/LP/Data/VecData/stella/guowang_data.jsonl
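  # assumed jsonl layout for the in-batch data (hypothetical field names):
  #   {"query": "...", "passage": "...", "neg_passages": ["...", ...]}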


pair_train_paths:
  binclf:
    - /iyunwen/nlpdata/work/LP/Data/VecData/v2/binclf_data.jsonl
  nli:
    - /iyunwen/nlpdata/work/LP/Data/VecData/v2/nli_data.jsonl
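  # assumed jsonl layout for the pair data (hypothetical field names):
  #   {"text_a": "...", "text_b": "...", "label": 0/1 for binclf, 0/1/2 for nli}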

loader_idxs: null
in_batch_bsz: 128
pair_bsz: 128
max_length: 512
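# note: in_batch_bsz, pair_bsz, train_args.per_device_train_batch_size and
# deepspeed.train_micro_batch_size_per_gpu are all 128; they presumably have to
# stay in sync when changed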

auto_ouput_dir: false
train_args:
  seed: 666
  output_dir: /iyunwen/nlpdata/work/LP/model_path/vec_embedding/stella/s4/
  evaluation_strategy: "no"
  num_train_epochs: 4
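  # logging_steps / eval_steps are set absurdly high, which effectively disables
  # periodic logging and evaluation (evaluation_strategy is "no" anyway)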
  logging_steps: 9999999
  eval_steps: 9999999
  per_device_train_batch_size: 128
  gradient_accumulation_steps: 1
  per_device_eval_batch_size: 32
  learning_rate: 5.0e-06
  weight_decay: 0.00001
  warmup_ratio: 0.05
  lr_scheduler_type: "linear"
  dataloader_drop_last: false

  fp16: true
  gradient_checkpointing: true
  deepspeed:
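    # fields set to "auto" are filled in at runtime by the HF Trainer DeepSpeed
    # integration from the matching train_args values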
    fp16:
      enabled: true
      hysteresis: 2
      initial_scale_power: 16
      loss_scale: 0
      loss_scale_window: 1000
      min_loss_scale: 1
    train_micro_batch_size_per_gpu: 128
    train_batch_size: auto
    gradient_accumulation_steps: 1
    gradient_clipping: auto
    optimizer:
      params:
        adam_w_mode: true
        lr: 1e-6
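        # note: this explicit 1e-6 differs from train_args.learning_rate (5.0e-06);
        # the scheduler's warmup_max_lr is "auto" (filled from train_args.learning_rate),
        # so the effective peak lr will likely be 5.0e-06 regardless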
        torch_adam: true
        weight_decay: auto
      type: AdamW
    scheduler:
      params:
        total_num_steps: auto
        warmup_max_lr: auto
        warmup_min_lr: auto
        warmup_num_steps: auto
      type: WarmupDecayLR
    steps_per_print: 4
    wall_clock_breakdown: false
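    # stage 0 disables ZeRO partitioning (plain data-parallel training), so the
    # bucket/overlap settings below are largely inert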
    zero_optimization:
      allgather_bucket_size: 200000000
      allgather_partitions: true
      contiguous_gradients: true
      overlap_comm: true
      reduce_bucket_size: auto
      reduce_scatter: true
      stage: 0