TheRootOf3 committed on
Commit
a697a5f
1 Parent(s): 083e6c9

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/added_tokens.json +102 -0
  2. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/all_results.json +1 -0
  3. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/config.json +31 -0
  4. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/generation_config.json +7 -0
  5. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/merges.txt +0 -0
  6. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/pytorch_model.bin +3 -0
  7. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/special_tokens_map.json +30 -0
  8. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/tokenizer.json +0 -0
  9. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/tokenizer_config.json +830 -0
  10. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/vocab.json +0 -0
  11. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/added_tokens.json +102 -0
  12. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/all_results.json +1 -0
  13. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/config.json +31 -0
  14. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/generation_config.json +7 -0
  15. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/merges.txt +0 -0
  16. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/pytorch_model.bin +3 -0
  17. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/special_tokens_map.json +30 -0
  18. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/tokenizer.json +0 -0
  19. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/tokenizer_config.json +830 -0
  20. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/vocab.json +0 -0
  21. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/added_tokens.json +102 -0
  22. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/all_results.json +1 -0
  23. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/config.json +31 -0
  24. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/generation_config.json +7 -0
  25. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/merges.txt +0 -0
  26. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/pytorch_model.bin +3 -0
  27. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/special_tokens_map.json +30 -0
  28. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/tokenizer.json +0 -0
  29. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/tokenizer_config.json +830 -0
  30. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/vocab.json +0 -0
  31. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/added_tokens.json +99 -0
  32. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/all_results.json +1 -0
  33. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/config.json +31 -0
  34. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/generation_config.json +7 -0
  35. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/merges.txt +0 -0
  36. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/pytorch_model.bin +3 -0
  37. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/special_tokens_map.json +30 -0
  38. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/tokenizer.json +0 -0
  39. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/tokenizer_config.json +830 -0
  40. adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/vocab.json +0 -0
  41. adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/all_results.json +1 -0
  42. adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/config.json +31 -0
  43. adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/generation_config.json +7 -0
  44. adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/merges.txt +0 -0
  45. adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/pytorch_model.bin +3 -0
  46. adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/special_tokens_map.json +30 -0
  47. adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/tokenizer.json +0 -0
  48. adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/tokenizer_config.json +30 -0
  49. adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/vocab.json +0 -0
  50. adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-hau/all_results.json +1 -0
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/added_tokens.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ " ለ": 50313,
3
+ " ላይ": 50358,
4
+ " መ": 50287,
5
+ " ማ": 50343,
6
+ " ም": 50344,
7
+ " ሰ": 50349,
8
+ " ሲ": 50354,
9
+ " ስ": 50340,
10
+ " በ": 50271,
11
+ " ባ": 50359,
12
+ " ብ": 50351,
13
+ " ተ": 50301,
14
+ " ነ": 50308,
15
+ " ነው": 50346,
16
+ " አ": 50274,
17
+ " እ": 50280,
18
+ " እና": 50361,
19
+ " እን": 50304,
20
+ " እንደ": 50350,
21
+ " ከ": 50299,
22
+ " ወ": 50324,
23
+ " ው": 50353,
24
+ " የ": 50267,
25
+ " የሚ": 50332,
26
+ " የተ": 50342,
27
+ " ያ": 50327,
28
+ " ይ": 50319,
29
+ " ግ": 50355,
30
+ "ሁ": 50363,
31
+ "ህ": 50318,
32
+ "ሆ": 50328,
33
+ "ለ": 50275,
34
+ "ሉ": 50315,
35
+ "ላ": 50290,
36
+ "ል": 50270,
37
+ "መ": 50282,
38
+ "ሚ": 50298,
39
+ "ማ": 50296,
40
+ "ም": 50272,
41
+ "ሞ": 50360,
42
+ "ረ": 50285,
43
+ "ሩ": 50337,
44
+ "ሪ": 50302,
45
+ "ራ": 50288,
46
+ "ር": 50269,
47
+ "ሮ": 50335,
48
+ "ሰ": 50300,
49
+ "ሳ": 50321,
50
+ "ስ": 50273,
51
+ "ቀ": 50309,
52
+ "ቃ": 50347,
53
+ "ቅ": 50325,
54
+ "በ": 50292,
55
+ "ባ": 50297,
56
+ "ብ": 50295,
57
+ "ተ": 50277,
58
+ "ቱ": 50330,
59
+ "ታ": 50293,
60
+ "ት": 50266,
61
+ "ቶ": 50336,
62
+ "ቸ": 50306,
63
+ "ቸው": 50310,
64
+ "ች": 50276,
65
+ "ነ": 50294,
66
+ "ነት": 50364,
67
+ "ኑ": 50356,
68
+ "ና": 50278,
69
+ "ን": 50265,
70
+ "ኛ": 50338,
71
+ "አ": 50305,
72
+ "ከ": 50317,
73
+ "ካ": 50316,
74
+ "ክ": 50303,
75
+ "ወ": 50323,
76
+ "ዊ": 50345,
77
+ "ዋ": 50314,
78
+ "ዋል": 50348,
79
+ "ው": 50268,
80
+ "ዎች": 50333,
81
+ "ዚ": 50362,
82
+ "ዝ": 50357,
83
+ "የ": 50329,
84
+ "ያ": 50279,
85
+ "ይ": 50281,
86
+ "ዮ": 50352,
87
+ "ደ": 50284,
88
+ "ዲ": 50341,
89
+ "ዳ": 50307,
90
+ "ድ": 50289,
91
+ "ገ": 50286,
92
+ "ጋ": 50331,
93
+ "ግ": 50291,
94
+ "ጠ": 50320,
95
+ "ጣ": 50334,
96
+ "ጥ": 50311,
97
+ "ፈ": 50326,
98
+ "ፍ": 50312,
99
+ "፡": 50322,
100
+ "።": 50283,
101
+ "፣": 50339
102
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/all_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"perplexity": 1.114477056449803}
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/SAN/intelsys/llm/aszablew/UCL_FYP/final_cpt_models/prop-0.25/opt_100-add/100M-amh",
3
+ "_remove_final_layer_norm": false,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "relu",
6
+ "architectures": [
7
+ "OPTForCausalLM"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "bos_token_id": 2,
11
+ "do_layer_norm_before": true,
12
+ "dropout": 0.1,
13
+ "enable_bias": true,
14
+ "eos_token_id": 2,
15
+ "ffn_dim": 8192,
16
+ "hidden_size": 2048,
17
+ "init_std": 0.02,
18
+ "layer_norm_elementwise_affine": true,
19
+ "layerdrop": 0.0,
20
+ "max_position_embeddings": 2048,
21
+ "model_type": "opt",
22
+ "num_attention_heads": 32,
23
+ "num_hidden_layers": 24,
24
+ "pad_token_id": 1,
25
+ "prefix": "</s>",
26
+ "torch_dtype": "bfloat16",
27
+ "transformers_version": "4.39.3",
28
+ "use_cache": true,
29
+ "vocab_size": 50432,
30
+ "word_embed_proj_dim": 2048
31
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 1,
6
+ "transformers_version": "4.39.3"
7
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d1276d6f18f169baab7bfcb8b0602f710ebf7ee37cf651dfb74de9917fb573d
3
+ size 2838873246
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/tokenizer_config.json ADDED
@@ -0,0 +1,830 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "1": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "2": {
14
+ "content": "</s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "50265": {
22
+ "content": "ን",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "50266": {
30
+ "content": "ት",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "50267": {
38
+ "content": " የ",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "50268": {
46
+ "content": "ው",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "50269": {
54
+ "content": "ር",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "50270": {
62
+ "content": "ል",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "50271": {
70
+ "content": " በ",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "50272": {
78
+ "content": "ም",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "50273": {
86
+ "content": "ስ",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "50274": {
94
+ "content": " አ",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "50275": {
102
+ "content": "ለ",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "50276": {
110
+ "content": "ች",
111
+ "lstrip": false,
112
+ "normalized": true,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "50277": {
118
+ "content": "ተ",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "50278": {
126
+ "content": "ና",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "50279": {
134
+ "content": "ያ",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "50280": {
142
+ "content": " እ",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "50281": {
150
+ "content": "ይ",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "50282": {
158
+ "content": "መ",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "50283": {
166
+ "content": "።",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "50284": {
174
+ "content": "ደ",
175
+ "lstrip": false,
176
+ "normalized": true,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "50285": {
182
+ "content": "ረ",
183
+ "lstrip": false,
184
+ "normalized": true,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "50286": {
190
+ "content": "ገ",
191
+ "lstrip": false,
192
+ "normalized": true,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "50287": {
198
+ "content": " መ",
199
+ "lstrip": false,
200
+ "normalized": true,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "50288": {
206
+ "content": "ራ",
207
+ "lstrip": false,
208
+ "normalized": true,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "50289": {
214
+ "content": "ድ",
215
+ "lstrip": false,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "50290": {
222
+ "content": "ላ",
223
+ "lstrip": false,
224
+ "normalized": true,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "50291": {
230
+ "content": "ግ",
231
+ "lstrip": false,
232
+ "normalized": true,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "50292": {
238
+ "content": "በ",
239
+ "lstrip": false,
240
+ "normalized": true,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "50293": {
246
+ "content": "ታ",
247
+ "lstrip": false,
248
+ "normalized": true,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "50294": {
254
+ "content": "ነ",
255
+ "lstrip": false,
256
+ "normalized": true,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "50295": {
262
+ "content": "ብ",
263
+ "lstrip": false,
264
+ "normalized": true,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "50296": {
270
+ "content": "ማ",
271
+ "lstrip": false,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "50297": {
278
+ "content": "ባ",
279
+ "lstrip": false,
280
+ "normalized": true,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "50298": {
286
+ "content": "ሚ",
287
+ "lstrip": false,
288
+ "normalized": true,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "50299": {
294
+ "content": " ከ",
295
+ "lstrip": false,
296
+ "normalized": true,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "50300": {
302
+ "content": "ሰ",
303
+ "lstrip": false,
304
+ "normalized": true,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "50301": {
310
+ "content": " ተ",
311
+ "lstrip": false,
312
+ "normalized": true,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ },
317
+ "50302": {
318
+ "content": "ሪ",
319
+ "lstrip": false,
320
+ "normalized": true,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": false
324
+ },
325
+ "50303": {
326
+ "content": "ክ",
327
+ "lstrip": false,
328
+ "normalized": true,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": false
332
+ },
333
+ "50304": {
334
+ "content": " እን",
335
+ "lstrip": false,
336
+ "normalized": true,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": false
340
+ },
341
+ "50305": {
342
+ "content": "አ",
343
+ "lstrip": false,
344
+ "normalized": true,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": false
348
+ },
349
+ "50306": {
350
+ "content": "ቸ",
351
+ "lstrip": false,
352
+ "normalized": true,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": false
356
+ },
357
+ "50307": {
358
+ "content": "ዳ",
359
+ "lstrip": false,
360
+ "normalized": true,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": false
364
+ },
365
+ "50308": {
366
+ "content": " ነ",
367
+ "lstrip": false,
368
+ "normalized": true,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": false
372
+ },
373
+ "50309": {
374
+ "content": "ቀ",
375
+ "lstrip": false,
376
+ "normalized": true,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": false
380
+ },
381
+ "50310": {
382
+ "content": "ቸው",
383
+ "lstrip": false,
384
+ "normalized": true,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": false
388
+ },
389
+ "50311": {
390
+ "content": "ጥ",
391
+ "lstrip": false,
392
+ "normalized": true,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": false
396
+ },
397
+ "50312": {
398
+ "content": "ፍ",
399
+ "lstrip": false,
400
+ "normalized": true,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": false
404
+ },
405
+ "50313": {
406
+ "content": " ለ",
407
+ "lstrip": false,
408
+ "normalized": true,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": false
412
+ },
413
+ "50314": {
414
+ "content": "ዋ",
415
+ "lstrip": false,
416
+ "normalized": true,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": false
420
+ },
421
+ "50315": {
422
+ "content": "ሉ",
423
+ "lstrip": false,
424
+ "normalized": true,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": false
428
+ },
429
+ "50316": {
430
+ "content": "ካ",
431
+ "lstrip": false,
432
+ "normalized": true,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": false
436
+ },
437
+ "50317": {
438
+ "content": "ከ",
439
+ "lstrip": false,
440
+ "normalized": true,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": false
444
+ },
445
+ "50318": {
446
+ "content": "ህ",
447
+ "lstrip": false,
448
+ "normalized": true,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": false
452
+ },
453
+ "50319": {
454
+ "content": " ይ",
455
+ "lstrip": false,
456
+ "normalized": true,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": false
460
+ },
461
+ "50320": {
462
+ "content": "ጠ",
463
+ "lstrip": false,
464
+ "normalized": true,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": false
468
+ },
469
+ "50321": {
470
+ "content": "ሳ",
471
+ "lstrip": false,
472
+ "normalized": true,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": false
476
+ },
477
+ "50322": {
478
+ "content": "፡",
479
+ "lstrip": false,
480
+ "normalized": true,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": false
484
+ },
485
+ "50323": {
486
+ "content": "ወ",
487
+ "lstrip": false,
488
+ "normalized": true,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": false
492
+ },
493
+ "50324": {
494
+ "content": " ወ",
495
+ "lstrip": false,
496
+ "normalized": true,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": false
500
+ },
501
+ "50325": {
502
+ "content": "ቅ",
503
+ "lstrip": false,
504
+ "normalized": true,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": false
508
+ },
509
+ "50326": {
510
+ "content": "ፈ",
511
+ "lstrip": false,
512
+ "normalized": true,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": false
516
+ },
517
+ "50327": {
518
+ "content": " ያ",
519
+ "lstrip": false,
520
+ "normalized": true,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": false
524
+ },
525
+ "50328": {
526
+ "content": "ሆ",
527
+ "lstrip": false,
528
+ "normalized": true,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": false
532
+ },
533
+ "50329": {
534
+ "content": "የ",
535
+ "lstrip": false,
536
+ "normalized": true,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": false
540
+ },
541
+ "50330": {
542
+ "content": "ቱ",
543
+ "lstrip": false,
544
+ "normalized": true,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": false
548
+ },
549
+ "50331": {
550
+ "content": "ጋ",
551
+ "lstrip": false,
552
+ "normalized": true,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": false
556
+ },
557
+ "50332": {
558
+ "content": " የሚ",
559
+ "lstrip": false,
560
+ "normalized": true,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": false
564
+ },
565
+ "50333": {
566
+ "content": "ዎች",
567
+ "lstrip": false,
568
+ "normalized": true,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": false
572
+ },
573
+ "50334": {
574
+ "content": "ጣ",
575
+ "lstrip": false,
576
+ "normalized": true,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": false
580
+ },
581
+ "50335": {
582
+ "content": "ሮ",
583
+ "lstrip": false,
584
+ "normalized": true,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": false
588
+ },
589
+ "50336": {
590
+ "content": "ቶ",
591
+ "lstrip": false,
592
+ "normalized": true,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": false
596
+ },
597
+ "50337": {
598
+ "content": "ሩ",
599
+ "lstrip": false,
600
+ "normalized": true,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": false
604
+ },
605
+ "50338": {
606
+ "content": "ኛ",
607
+ "lstrip": false,
608
+ "normalized": true,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": false
612
+ },
613
+ "50339": {
614
+ "content": "፣",
615
+ "lstrip": false,
616
+ "normalized": true,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": false
620
+ },
621
+ "50340": {
622
+ "content": " ስ",
623
+ "lstrip": false,
624
+ "normalized": true,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": false
628
+ },
629
+ "50341": {
630
+ "content": "ዲ",
631
+ "lstrip": false,
632
+ "normalized": true,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": false
636
+ },
637
+ "50342": {
638
+ "content": " የተ",
639
+ "lstrip": false,
640
+ "normalized": true,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": false
644
+ },
645
+ "50343": {
646
+ "content": " ማ",
647
+ "lstrip": false,
648
+ "normalized": true,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": false
652
+ },
653
+ "50344": {
654
+ "content": " ም",
655
+ "lstrip": false,
656
+ "normalized": true,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": false
660
+ },
661
+ "50345": {
662
+ "content": "ዊ",
663
+ "lstrip": false,
664
+ "normalized": true,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": false
668
+ },
669
+ "50346": {
670
+ "content": " ነው",
671
+ "lstrip": false,
672
+ "normalized": true,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": false
676
+ },
677
+ "50347": {
678
+ "content": "ቃ",
679
+ "lstrip": false,
680
+ "normalized": true,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": false
684
+ },
685
+ "50348": {
686
+ "content": "ዋል",
687
+ "lstrip": false,
688
+ "normalized": true,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": false
692
+ },
693
+ "50349": {
694
+ "content": " ሰ",
695
+ "lstrip": false,
696
+ "normalized": true,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": false
700
+ },
701
+ "50350": {
702
+ "content": " እንደ",
703
+ "lstrip": false,
704
+ "normalized": true,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": false
708
+ },
709
+ "50351": {
710
+ "content": " ብ",
711
+ "lstrip": false,
712
+ "normalized": true,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": false
716
+ },
717
+ "50352": {
718
+ "content": "ዮ",
719
+ "lstrip": false,
720
+ "normalized": true,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": false
724
+ },
725
+ "50353": {
726
+ "content": " ው",
727
+ "lstrip": false,
728
+ "normalized": true,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": false
732
+ },
733
+ "50354": {
734
+ "content": " ሲ",
735
+ "lstrip": false,
736
+ "normalized": true,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": false
740
+ },
741
+ "50355": {
742
+ "content": " ግ",
743
+ "lstrip": false,
744
+ "normalized": true,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": false
748
+ },
749
+ "50356": {
750
+ "content": "ኑ",
751
+ "lstrip": false,
752
+ "normalized": true,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": false
756
+ },
757
+ "50357": {
758
+ "content": "ዝ",
759
+ "lstrip": false,
760
+ "normalized": true,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": false
764
+ },
765
+ "50358": {
766
+ "content": " ላይ",
767
+ "lstrip": false,
768
+ "normalized": true,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": false
772
+ },
773
+ "50359": {
774
+ "content": " ባ",
775
+ "lstrip": false,
776
+ "normalized": true,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": false
780
+ },
781
+ "50360": {
782
+ "content": "ሞ",
783
+ "lstrip": false,
784
+ "normalized": true,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": false
788
+ },
789
+ "50361": {
790
+ "content": " እና",
791
+ "lstrip": false,
792
+ "normalized": true,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": false
796
+ },
797
+ "50362": {
798
+ "content": "ዚ",
799
+ "lstrip": false,
800
+ "normalized": true,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": false
804
+ },
805
+ "50363": {
806
+ "content": "ሁ",
807
+ "lstrip": false,
808
+ "normalized": true,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": false
812
+ },
813
+ "50364": {
814
+ "content": "ነት",
815
+ "lstrip": false,
816
+ "normalized": true,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": false
820
+ }
821
+ },
822
+ "bos_token": "</s>",
823
+ "clean_up_tokenization_spaces": true,
824
+ "eos_token": "</s>",
825
+ "errors": "replace",
826
+ "model_max_length": 1000000000000000019884624838656,
827
+ "pad_token": "<pad>",
828
+ "tokenizer_class": "GPT2Tokenizer",
829
+ "unk_token": "</s>"
830
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-amh/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/added_tokens.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ " Najeriya": 50318,
3
+ " abin": 50337,
4
+ " ake": 50341,
5
+ " amf": 50359,
6
+ " bayan": 50300,
7
+ " bayyana": 50343,
8
+ " biy": 50349,
9
+ " cewa": 50281,
10
+ " ci": 50299,
11
+ " cikin": 50278,
12
+ " daga": 50282,
13
+ " dai": 50312,
14
+ " duk": 50314,
15
+ " gaba": 50311,
16
+ " gi": 50320,
17
+ " gwamnat": 50353,
18
+ " haka": 50336,
19
+ " han": 50289,
20
+ " huk": 50342,
21
+ " inda": 50348,
22
+ " iya": 50307,
23
+ " ji": 50287,
24
+ " jihar": 50319,
25
+ " kai": 50316,
26
+ " kar": 50279,
27
+ " kara": 50331,
28
+ " kasa": 50296,
29
+ " kasar": 50297,
30
+ " ku": 50265,
31
+ " kuma": 50272,
32
+ " kwa": 50347,
33
+ " kya": 50364,
34
+ " loka": 50324,
35
+ " lokacin": 50356,
36
+ " mai": 50286,
37
+ " masu": 50304,
38
+ " mata": 50308,
39
+ " matsa": 50333,
40
+ " mutan": 50332,
41
+ " mutane": 50360,
42
+ " ranar": 50344,
43
+ " sai": 50317,
44
+ " samu": 50345,
45
+ " sha": 50292,
46
+ " shekar": 50352,
47
+ " shi": 50275,
48
+ " shugaban": 50322,
49
+ " suka": 50283,
50
+ " suna": 50329,
51
+ " tare": 50321,
52
+ " tsa": 50303,
53
+ " tsar": 50338,
54
+ " wadan": 50350,
55
+ " wajen": 50357,
56
+ " wan": 50268,
57
+ " wanda": 50302,
58
+ " wani": 50291,
59
+ " wannan": 50288,
60
+ " wasu": 50323,
61
+ " wata": 50339,
62
+ " yadda": 50328,
63
+ " yan": 50274,
64
+ " yana": 50310,
65
+ " yi": 50271,
66
+ " yin": 50363,
67
+ " za": 50266,
68
+ " zai": 50315,
69
+ " zuwa": 50313,
70
+ " ƙ": 50298,
71
+ " ‘": 50325,
72
+ " “": 50334,
73
+ "anar": 50305,
74
+ "anin": 50327,
75
+ "bba": 50354,
76
+ "dda": 50290,
77
+ "ikin": 50273,
78
+ "iyar": 50284,
79
+ "iyyar": 50358,
80
+ "jeriya": 50309,
81
+ "kara": 50361,
82
+ "kon": 50355,
83
+ "kwa": 50335,
84
+ "sar": 50269,
85
+ "shen": 50351,
86
+ "shin": 50294,
87
+ "tsa": 50306,
88
+ "ugaban": 50295,
89
+ "ungiyar": 50362,
90
+ "uwa": 50276,
91
+ "wam": 50285,
92
+ "wamn": 50301,
93
+ "wamnat": 50326,
94
+ "yar": 50267,
95
+ "yin": 50277,
96
+ "yyan": 50330,
97
+ "yyana": 50340,
98
+ "ƙ": 50280,
99
+ "ɗ": 50293,
100
+ "’": 50270,
101
+ "”": 50346
102
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/all_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"perplexity": 1.1826008059495123}
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/SAN/intelsys/llm/aszablew/UCL_FYP/final_cpt_models/prop-0.25/opt_100-add/100M-hau",
3
+ "_remove_final_layer_norm": false,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "relu",
6
+ "architectures": [
7
+ "OPTForCausalLM"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "bos_token_id": 2,
11
+ "do_layer_norm_before": true,
12
+ "dropout": 0.1,
13
+ "enable_bias": true,
14
+ "eos_token_id": 2,
15
+ "ffn_dim": 8192,
16
+ "hidden_size": 2048,
17
+ "init_std": 0.02,
18
+ "layer_norm_elementwise_affine": true,
19
+ "layerdrop": 0.0,
20
+ "max_position_embeddings": 2048,
21
+ "model_type": "opt",
22
+ "num_attention_heads": 32,
23
+ "num_hidden_layers": 24,
24
+ "pad_token_id": 1,
25
+ "prefix": "</s>",
26
+ "torch_dtype": "bfloat16",
27
+ "transformers_version": "4.39.3",
28
+ "use_cache": true,
29
+ "vocab_size": 50432,
30
+ "word_embed_proj_dim": 2048
31
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 1,
6
+ "transformers_version": "4.39.3"
7
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4f44622f2478b8e2937a69a4362333e30234f950570c6af8866b619aa19271d
3
+ size 2838873246
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/tokenizer_config.json ADDED
@@ -0,0 +1,830 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "1": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "2": {
14
+ "content": "</s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "50265": {
22
+ "content": " ku",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "50266": {
30
+ "content": " za",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "50267": {
38
+ "content": "yar",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "50268": {
46
+ "content": " wan",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "50269": {
54
+ "content": "sar",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "50270": {
62
+ "content": "’",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "50271": {
70
+ "content": " yi",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "50272": {
78
+ "content": " kuma",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "50273": {
86
+ "content": "ikin",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "50274": {
94
+ "content": " yan",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "50275": {
102
+ "content": " shi",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "50276": {
110
+ "content": "uwa",
111
+ "lstrip": false,
112
+ "normalized": true,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "50277": {
118
+ "content": "yin",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "50278": {
126
+ "content": " cikin",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "50279": {
134
+ "content": " kar",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "50280": {
142
+ "content": "ƙ",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "50281": {
150
+ "content": " cewa",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "50282": {
158
+ "content": " daga",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "50283": {
166
+ "content": " suka",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "50284": {
174
+ "content": "iyar",
175
+ "lstrip": false,
176
+ "normalized": true,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "50285": {
182
+ "content": "wam",
183
+ "lstrip": false,
184
+ "normalized": true,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "50286": {
190
+ "content": " mai",
191
+ "lstrip": false,
192
+ "normalized": true,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "50287": {
198
+ "content": " ji",
199
+ "lstrip": false,
200
+ "normalized": true,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "50288": {
206
+ "content": " wannan",
207
+ "lstrip": false,
208
+ "normalized": true,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "50289": {
214
+ "content": " han",
215
+ "lstrip": false,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "50290": {
222
+ "content": "dda",
223
+ "lstrip": false,
224
+ "normalized": true,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "50291": {
230
+ "content": " wani",
231
+ "lstrip": false,
232
+ "normalized": true,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "50292": {
238
+ "content": " sha",
239
+ "lstrip": false,
240
+ "normalized": true,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "50293": {
246
+ "content": "ɗ",
247
+ "lstrip": false,
248
+ "normalized": true,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "50294": {
254
+ "content": "shin",
255
+ "lstrip": false,
256
+ "normalized": true,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "50295": {
262
+ "content": "ugaban",
263
+ "lstrip": false,
264
+ "normalized": true,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "50296": {
270
+ "content": " kasa",
271
+ "lstrip": false,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "50297": {
278
+ "content": " kasar",
279
+ "lstrip": false,
280
+ "normalized": true,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "50298": {
286
+ "content": " ƙ",
287
+ "lstrip": false,
288
+ "normalized": true,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "50299": {
294
+ "content": " ci",
295
+ "lstrip": false,
296
+ "normalized": true,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "50300": {
302
+ "content": " bayan",
303
+ "lstrip": false,
304
+ "normalized": true,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "50301": {
310
+ "content": "wamn",
311
+ "lstrip": false,
312
+ "normalized": true,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ },
317
+ "50302": {
318
+ "content": " wanda",
319
+ "lstrip": false,
320
+ "normalized": true,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": false
324
+ },
325
+ "50303": {
326
+ "content": " tsa",
327
+ "lstrip": false,
328
+ "normalized": true,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": false
332
+ },
333
+ "50304": {
334
+ "content": " masu",
335
+ "lstrip": false,
336
+ "normalized": true,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": false
340
+ },
341
+ "50305": {
342
+ "content": "anar",
343
+ "lstrip": false,
344
+ "normalized": true,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": false
348
+ },
349
+ "50306": {
350
+ "content": "tsa",
351
+ "lstrip": false,
352
+ "normalized": true,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": false
356
+ },
357
+ "50307": {
358
+ "content": " iya",
359
+ "lstrip": false,
360
+ "normalized": true,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": false
364
+ },
365
+ "50308": {
366
+ "content": " mata",
367
+ "lstrip": false,
368
+ "normalized": true,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": false
372
+ },
373
+ "50309": {
374
+ "content": "jeriya",
375
+ "lstrip": false,
376
+ "normalized": true,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": false
380
+ },
381
+ "50310": {
382
+ "content": " yana",
383
+ "lstrip": false,
384
+ "normalized": true,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": false
388
+ },
389
+ "50311": {
390
+ "content": " gaba",
391
+ "lstrip": false,
392
+ "normalized": true,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": false
396
+ },
397
+ "50312": {
398
+ "content": " dai",
399
+ "lstrip": false,
400
+ "normalized": true,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": false
404
+ },
405
+ "50313": {
406
+ "content": " zuwa",
407
+ "lstrip": false,
408
+ "normalized": true,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": false
412
+ },
413
+ "50314": {
414
+ "content": " duk",
415
+ "lstrip": false,
416
+ "normalized": true,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": false
420
+ },
421
+ "50315": {
422
+ "content": " zai",
423
+ "lstrip": false,
424
+ "normalized": true,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": false
428
+ },
429
+ "50316": {
430
+ "content": " kai",
431
+ "lstrip": false,
432
+ "normalized": true,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": false
436
+ },
437
+ "50317": {
438
+ "content": " sai",
439
+ "lstrip": false,
440
+ "normalized": true,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": false
444
+ },
445
+ "50318": {
446
+ "content": " Najeriya",
447
+ "lstrip": false,
448
+ "normalized": true,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": false
452
+ },
453
+ "50319": {
454
+ "content": " jihar",
455
+ "lstrip": false,
456
+ "normalized": true,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": false
460
+ },
461
+ "50320": {
462
+ "content": " gi",
463
+ "lstrip": false,
464
+ "normalized": true,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": false
468
+ },
469
+ "50321": {
470
+ "content": " tare",
471
+ "lstrip": false,
472
+ "normalized": true,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": false
476
+ },
477
+ "50322": {
478
+ "content": " shugaban",
479
+ "lstrip": false,
480
+ "normalized": true,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": false
484
+ },
485
+ "50323": {
486
+ "content": " wasu",
487
+ "lstrip": false,
488
+ "normalized": true,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": false
492
+ },
493
+ "50324": {
494
+ "content": " loka",
495
+ "lstrip": false,
496
+ "normalized": true,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": false
500
+ },
501
+ "50325": {
502
+ "content": " ‘",
503
+ "lstrip": false,
504
+ "normalized": true,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": false
508
+ },
509
+ "50326": {
510
+ "content": "wamnat",
511
+ "lstrip": false,
512
+ "normalized": true,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": false
516
+ },
517
+ "50327": {
518
+ "content": "anin",
519
+ "lstrip": false,
520
+ "normalized": true,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": false
524
+ },
525
+ "50328": {
526
+ "content": " yadda",
527
+ "lstrip": false,
528
+ "normalized": true,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": false
532
+ },
533
+ "50329": {
534
+ "content": " suna",
535
+ "lstrip": false,
536
+ "normalized": true,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": false
540
+ },
541
+ "50330": {
542
+ "content": "yyan",
543
+ "lstrip": false,
544
+ "normalized": true,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": false
548
+ },
549
+ "50331": {
550
+ "content": " kara",
551
+ "lstrip": false,
552
+ "normalized": true,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": false
556
+ },
557
+ "50332": {
558
+ "content": " mutan",
559
+ "lstrip": false,
560
+ "normalized": true,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": false
564
+ },
565
+ "50333": {
566
+ "content": " matsa",
567
+ "lstrip": false,
568
+ "normalized": true,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": false
572
+ },
573
+ "50334": {
574
+ "content": " “",
575
+ "lstrip": false,
576
+ "normalized": true,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": false
580
+ },
581
+ "50335": {
582
+ "content": "kwa",
583
+ "lstrip": false,
584
+ "normalized": true,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": false
588
+ },
589
+ "50336": {
590
+ "content": " haka",
591
+ "lstrip": false,
592
+ "normalized": true,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": false
596
+ },
597
+ "50337": {
598
+ "content": " abin",
599
+ "lstrip": false,
600
+ "normalized": true,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": false
604
+ },
605
+ "50338": {
606
+ "content": " tsar",
607
+ "lstrip": false,
608
+ "normalized": true,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": false
612
+ },
613
+ "50339": {
614
+ "content": " wata",
615
+ "lstrip": false,
616
+ "normalized": true,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": false
620
+ },
621
+ "50340": {
622
+ "content": "yyana",
623
+ "lstrip": false,
624
+ "normalized": true,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": false
628
+ },
629
+ "50341": {
630
+ "content": " ake",
631
+ "lstrip": false,
632
+ "normalized": true,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": false
636
+ },
637
+ "50342": {
638
+ "content": " huk",
639
+ "lstrip": false,
640
+ "normalized": true,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": false
644
+ },
645
+ "50343": {
646
+ "content": " bayyana",
647
+ "lstrip": false,
648
+ "normalized": true,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": false
652
+ },
653
+ "50344": {
654
+ "content": " ranar",
655
+ "lstrip": false,
656
+ "normalized": true,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": false
660
+ },
661
+ "50345": {
662
+ "content": " samu",
663
+ "lstrip": false,
664
+ "normalized": true,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": false
668
+ },
669
+ "50346": {
670
+ "content": "”",
671
+ "lstrip": false,
672
+ "normalized": true,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": false
676
+ },
677
+ "50347": {
678
+ "content": " kwa",
679
+ "lstrip": false,
680
+ "normalized": true,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": false
684
+ },
685
+ "50348": {
686
+ "content": " inda",
687
+ "lstrip": false,
688
+ "normalized": true,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": false
692
+ },
693
+ "50349": {
694
+ "content": " biy",
695
+ "lstrip": false,
696
+ "normalized": true,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": false
700
+ },
701
+ "50350": {
702
+ "content": " wadan",
703
+ "lstrip": false,
704
+ "normalized": true,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": false
708
+ },
709
+ "50351": {
710
+ "content": "shen",
711
+ "lstrip": false,
712
+ "normalized": true,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": false
716
+ },
717
+ "50352": {
718
+ "content": " shekar",
719
+ "lstrip": false,
720
+ "normalized": true,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": false
724
+ },
725
+ "50353": {
726
+ "content": " gwamnat",
727
+ "lstrip": false,
728
+ "normalized": true,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": false
732
+ },
733
+ "50354": {
734
+ "content": "bba",
735
+ "lstrip": false,
736
+ "normalized": true,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": false
740
+ },
741
+ "50355": {
742
+ "content": "kon",
743
+ "lstrip": false,
744
+ "normalized": true,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": false
748
+ },
749
+ "50356": {
750
+ "content": " lokacin",
751
+ "lstrip": false,
752
+ "normalized": true,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": false
756
+ },
757
+ "50357": {
758
+ "content": " wajen",
759
+ "lstrip": false,
760
+ "normalized": true,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": false
764
+ },
765
+ "50358": {
766
+ "content": "iyyar",
767
+ "lstrip": false,
768
+ "normalized": true,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": false
772
+ },
773
+ "50359": {
774
+ "content": " amf",
775
+ "lstrip": false,
776
+ "normalized": true,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": false
780
+ },
781
+ "50360": {
782
+ "content": " mutane",
783
+ "lstrip": false,
784
+ "normalized": true,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": false
788
+ },
789
+ "50361": {
790
+ "content": "kara",
791
+ "lstrip": false,
792
+ "normalized": true,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": false
796
+ },
797
+ "50362": {
798
+ "content": "ungiyar",
799
+ "lstrip": false,
800
+ "normalized": true,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": false
804
+ },
805
+ "50363": {
806
+ "content": " yin",
807
+ "lstrip": false,
808
+ "normalized": true,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": false
812
+ },
813
+ "50364": {
814
+ "content": " kya",
815
+ "lstrip": false,
816
+ "normalized": true,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": false
820
+ }
821
+ },
822
+ "bos_token": "</s>",
823
+ "clean_up_tokenization_spaces": true,
824
+ "eos_token": "</s>",
825
+ "errors": "replace",
826
+ "model_max_length": 1000000000000000019884624838656,
827
+ "pad_token": "<pad>",
828
+ "tokenizer_class": "GPT2Tokenizer",
829
+ "unk_token": "</s>"
830
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-hau/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/added_tokens.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ " abụ": 50330,
3
+ " abụọ": 50348,
4
+ " afọ": 50314,
5
+ " ahụ": 50287,
6
+ " akwụkwọ": 50334,
7
+ " akụ": 50351,
8
+ " bụ": 50274,
9
+ " dị": 50278,
10
+ " ebe": 50332,
11
+ " ihe": 50280,
12
+ " iji": 50357,
13
+ " ike": 50345,
14
+ " iri": 50353,
15
+ " isi": 50322,
16
+ " kw": 50335,
17
+ " maka": 50298,
18
+ " mba": 50341,
19
+ " mgbe": 50346,
20
+ " mkp": 50363,
21
+ " mma": 50306,
22
+ " mmadụ": 50344,
23
+ " mme": 50343,
24
+ " ndị": 50275,
25
+ " nk": 50326,
26
+ " nke": 50271,
27
+ " nkiri": 50364,
28
+ " nw": 50303,
29
+ " nwa": 50321,
30
+ " nwere": 50329,
31
+ " ny": 50359,
32
+ " onye": 50281,
33
+ " otu": 50308,
34
+ " ị": 50294,
35
+ " Ọ": 50301,
36
+ " ọ": 50268,
37
+ " ọn": 50339,
38
+ " ọrụ": 50292,
39
+ " ọzọ": 50352,
40
+ " ụ": 50276,
41
+ " ụlọ": 50315,
42
+ " ụmụ": 50337,
43
+ "afọ": 50318,
44
+ "arị": 50361,
45
+ "bodo": 50360,
46
+ "bụ": 50270,
47
+ "chị": 50297,
48
+ "dị": 50269,
49
+ "dụ": 50290,
50
+ "eti": 50328,
51
+ "fọ": 50285,
52
+ "gba": 50295,
53
+ "gbe": 50310,
54
+ "gbo": 50355,
55
+ "gbu": 50342,
56
+ "ghị": 50305,
57
+ "gw": 50320,
58
+ "gwu": 50312,
59
+ "gụ": 50350,
60
+ "hụ": 50279,
61
+ "kp": 50277,
62
+ "kpe": 50349,
63
+ "kwa": 50288,
64
+ "kwu": 50291,
65
+ "kwọ": 50309,
66
+ "kwụ": 50300,
67
+ "kwụkwọ": 50311,
68
+ "kọ": 50284,
69
+ "kụ": 50296,
70
+ "lọ": 50293,
71
+ "mụ": 50302,
72
+ "nwe": 50354,
73
+ "nya": 50331,
74
+ "nye": 50273,
75
+ "nyere": 50324,
76
+ "nyị": 50325,
77
+ "pụ": 50282,
78
+ "pụta": 50316,
79
+ "pụtara": 50327,
80
+ "rị": 50283,
81
+ "rịa": 50338,
82
+ "rọ": 50304,
83
+ "rụ": 50272,
84
+ "sị": 50317,
85
+ "sụ": 50358,
86
+ "tara": 50299,
87
+ "tere": 50347,
88
+ "tọ": 50323,
89
+ "tụ": 50286,
90
+ "tụtụ": 50356,
91
+ "zọ": 50313,
92
+ "zụ": 50336,
93
+ "ị": 50267,
94
+ "ịa": 50319,
95
+ "Ọ": 50289,
96
+ "ọ": 50266,
97
+ "ọrọ": 50307,
98
+ "ọọ": 50340,
99
+ "ụ": 50265,
100
+ "ụrụ": 50333,
101
+ "’": 50362
102
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/all_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"perplexity": 1.2453361236055542}
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/SAN/intelsys/llm/aszablew/UCL_FYP/final_cpt_models/prop-0.25/opt_100-add/100M-ibo",
3
+ "_remove_final_layer_norm": false,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "relu",
6
+ "architectures": [
7
+ "OPTForCausalLM"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "bos_token_id": 2,
11
+ "do_layer_norm_before": true,
12
+ "dropout": 0.1,
13
+ "enable_bias": true,
14
+ "eos_token_id": 2,
15
+ "ffn_dim": 8192,
16
+ "hidden_size": 2048,
17
+ "init_std": 0.02,
18
+ "layer_norm_elementwise_affine": true,
19
+ "layerdrop": 0.0,
20
+ "max_position_embeddings": 2048,
21
+ "model_type": "opt",
22
+ "num_attention_heads": 32,
23
+ "num_hidden_layers": 24,
24
+ "pad_token_id": 1,
25
+ "prefix": "</s>",
26
+ "torch_dtype": "bfloat16",
27
+ "transformers_version": "4.39.3",
28
+ "use_cache": true,
29
+ "vocab_size": 50432,
30
+ "word_embed_proj_dim": 2048
31
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 1,
6
+ "transformers_version": "4.39.3"
7
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49a2a505061f45e7208fa635cf004ac5868a856b09638d829cdb9b28ca6de687
3
+ size 2838873246
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/tokenizer_config.json ADDED
@@ -0,0 +1,830 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "1": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "2": {
14
+ "content": "</s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "50265": {
22
+ "content": "ụ",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "50266": {
30
+ "content": "ọ",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "50267": {
38
+ "content": "ị",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "50268": {
46
+ "content": " ọ",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "50269": {
54
+ "content": "dị",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "50270": {
62
+ "content": "bụ",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "50271": {
70
+ "content": " nke",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "50272": {
78
+ "content": "rụ",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "50273": {
86
+ "content": "nye",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "50274": {
94
+ "content": " bụ",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "50275": {
102
+ "content": " ndị",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "50276": {
110
+ "content": " ụ",
111
+ "lstrip": false,
112
+ "normalized": true,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "50277": {
118
+ "content": "kp",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "50278": {
126
+ "content": " dị",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "50279": {
134
+ "content": "hụ",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "50280": {
142
+ "content": " ihe",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "50281": {
150
+ "content": " onye",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "50282": {
158
+ "content": "pụ",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "50283": {
166
+ "content": "rị",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "50284": {
174
+ "content": "kọ",
175
+ "lstrip": false,
176
+ "normalized": true,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "50285": {
182
+ "content": "fọ",
183
+ "lstrip": false,
184
+ "normalized": true,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "50286": {
190
+ "content": "tụ",
191
+ "lstrip": false,
192
+ "normalized": true,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "50287": {
198
+ "content": " ahụ",
199
+ "lstrip": false,
200
+ "normalized": true,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "50288": {
206
+ "content": "kwa",
207
+ "lstrip": false,
208
+ "normalized": true,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "50289": {
214
+ "content": "Ọ",
215
+ "lstrip": false,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "50290": {
222
+ "content": "dụ",
223
+ "lstrip": false,
224
+ "normalized": true,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "50291": {
230
+ "content": "kwu",
231
+ "lstrip": false,
232
+ "normalized": true,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "50292": {
238
+ "content": " ọrụ",
239
+ "lstrip": false,
240
+ "normalized": true,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "50293": {
246
+ "content": "lọ",
247
+ "lstrip": false,
248
+ "normalized": true,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "50294": {
254
+ "content": " ị",
255
+ "lstrip": false,
256
+ "normalized": true,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "50295": {
262
+ "content": "gba",
263
+ "lstrip": false,
264
+ "normalized": true,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "50296": {
270
+ "content": "kụ",
271
+ "lstrip": false,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "50297": {
278
+ "content": "chị",
279
+ "lstrip": false,
280
+ "normalized": true,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "50298": {
286
+ "content": " maka",
287
+ "lstrip": false,
288
+ "normalized": true,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "50299": {
294
+ "content": "tara",
295
+ "lstrip": false,
296
+ "normalized": true,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "50300": {
302
+ "content": "kwụ",
303
+ "lstrip": false,
304
+ "normalized": true,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "50301": {
310
+ "content": " Ọ",
311
+ "lstrip": false,
312
+ "normalized": true,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ },
317
+ "50302": {
318
+ "content": "mụ",
319
+ "lstrip": false,
320
+ "normalized": true,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": false
324
+ },
325
+ "50303": {
326
+ "content": " nw",
327
+ "lstrip": false,
328
+ "normalized": true,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": false
332
+ },
333
+ "50304": {
334
+ "content": "rọ",
335
+ "lstrip": false,
336
+ "normalized": true,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": false
340
+ },
341
+ "50305": {
342
+ "content": "ghị",
343
+ "lstrip": false,
344
+ "normalized": true,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": false
348
+ },
349
+ "50306": {
350
+ "content": " mma",
351
+ "lstrip": false,
352
+ "normalized": true,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": false
356
+ },
357
+ "50307": {
358
+ "content": "ọrọ",
359
+ "lstrip": false,
360
+ "normalized": true,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": false
364
+ },
365
+ "50308": {
366
+ "content": " otu",
367
+ "lstrip": false,
368
+ "normalized": true,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": false
372
+ },
373
+ "50309": {
374
+ "content": "kwọ",
375
+ "lstrip": false,
376
+ "normalized": true,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": false
380
+ },
381
+ "50310": {
382
+ "content": "gbe",
383
+ "lstrip": false,
384
+ "normalized": true,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": false
388
+ },
389
+ "50311": {
390
+ "content": "kwụkwọ",
391
+ "lstrip": false,
392
+ "normalized": true,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": false
396
+ },
397
+ "50312": {
398
+ "content": "gwu",
399
+ "lstrip": false,
400
+ "normalized": true,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": false
404
+ },
405
+ "50313": {
406
+ "content": "zọ",
407
+ "lstrip": false,
408
+ "normalized": true,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": false
412
+ },
413
+ "50314": {
414
+ "content": " afọ",
415
+ "lstrip": false,
416
+ "normalized": true,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": false
420
+ },
421
+ "50315": {
422
+ "content": " ụlọ",
423
+ "lstrip": false,
424
+ "normalized": true,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": false
428
+ },
429
+ "50316": {
430
+ "content": "pụta",
431
+ "lstrip": false,
432
+ "normalized": true,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": false
436
+ },
437
+ "50317": {
438
+ "content": "sị",
439
+ "lstrip": false,
440
+ "normalized": true,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": false
444
+ },
445
+ "50318": {
446
+ "content": "afọ",
447
+ "lstrip": false,
448
+ "normalized": true,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": false
452
+ },
453
+ "50319": {
454
+ "content": "ịa",
455
+ "lstrip": false,
456
+ "normalized": true,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": false
460
+ },
461
+ "50320": {
462
+ "content": "gw",
463
+ "lstrip": false,
464
+ "normalized": true,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": false
468
+ },
469
+ "50321": {
470
+ "content": " nwa",
471
+ "lstrip": false,
472
+ "normalized": true,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": false
476
+ },
477
+ "50322": {
478
+ "content": " isi",
479
+ "lstrip": false,
480
+ "normalized": true,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": false
484
+ },
485
+ "50323": {
486
+ "content": "tọ",
487
+ "lstrip": false,
488
+ "normalized": true,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": false
492
+ },
493
+ "50324": {
494
+ "content": "nyere",
495
+ "lstrip": false,
496
+ "normalized": true,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": false
500
+ },
501
+ "50325": {
502
+ "content": "nyị",
503
+ "lstrip": false,
504
+ "normalized": true,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": false
508
+ },
509
+ "50326": {
510
+ "content": " nk",
511
+ "lstrip": false,
512
+ "normalized": true,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": false
516
+ },
517
+ "50327": {
518
+ "content": "pụtara",
519
+ "lstrip": false,
520
+ "normalized": true,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": false
524
+ },
525
+ "50328": {
526
+ "content": "eti",
527
+ "lstrip": false,
528
+ "normalized": true,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": false
532
+ },
533
+ "50329": {
534
+ "content": " nwere",
535
+ "lstrip": false,
536
+ "normalized": true,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": false
540
+ },
541
+ "50330": {
542
+ "content": " abụ",
543
+ "lstrip": false,
544
+ "normalized": true,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": false
548
+ },
549
+ "50331": {
550
+ "content": "nya",
551
+ "lstrip": false,
552
+ "normalized": true,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": false
556
+ },
557
+ "50332": {
558
+ "content": " ebe",
559
+ "lstrip": false,
560
+ "normalized": true,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": false
564
+ },
565
+ "50333": {
566
+ "content": "ụrụ",
567
+ "lstrip": false,
568
+ "normalized": true,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": false
572
+ },
573
+ "50334": {
574
+ "content": " akwụkwọ",
575
+ "lstrip": false,
576
+ "normalized": true,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": false
580
+ },
581
+ "50335": {
582
+ "content": " kw",
583
+ "lstrip": false,
584
+ "normalized": true,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": false
588
+ },
589
+ "50336": {
590
+ "content": "zụ",
591
+ "lstrip": false,
592
+ "normalized": true,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": false
596
+ },
597
+ "50337": {
598
+ "content": " ụmụ",
599
+ "lstrip": false,
600
+ "normalized": true,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": false
604
+ },
605
+ "50338": {
606
+ "content": "rịa",
607
+ "lstrip": false,
608
+ "normalized": true,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": false
612
+ },
613
+ "50339": {
614
+ "content": " ọn",
615
+ "lstrip": false,
616
+ "normalized": true,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": false
620
+ },
621
+ "50340": {
622
+ "content": "ọọ",
623
+ "lstrip": false,
624
+ "normalized": true,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": false
628
+ },
629
+ "50341": {
630
+ "content": " mba",
631
+ "lstrip": false,
632
+ "normalized": true,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": false
636
+ },
637
+ "50342": {
638
+ "content": "gbu",
639
+ "lstrip": false,
640
+ "normalized": true,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": false
644
+ },
645
+ "50343": {
646
+ "content": " mme",
647
+ "lstrip": false,
648
+ "normalized": true,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": false
652
+ },
653
+ "50344": {
654
+ "content": " mmadụ",
655
+ "lstrip": false,
656
+ "normalized": true,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": false
660
+ },
661
+ "50345": {
662
+ "content": " ike",
663
+ "lstrip": false,
664
+ "normalized": true,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": false
668
+ },
669
+ "50346": {
670
+ "content": " mgbe",
671
+ "lstrip": false,
672
+ "normalized": true,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": false
676
+ },
677
+ "50347": {
678
+ "content": "tere",
679
+ "lstrip": false,
680
+ "normalized": true,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": false
684
+ },
685
+ "50348": {
686
+ "content": " abụọ",
687
+ "lstrip": false,
688
+ "normalized": true,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": false
692
+ },
693
+ "50349": {
694
+ "content": "kpe",
695
+ "lstrip": false,
696
+ "normalized": true,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": false
700
+ },
701
+ "50350": {
702
+ "content": "gụ",
703
+ "lstrip": false,
704
+ "normalized": true,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": false
708
+ },
709
+ "50351": {
710
+ "content": " akụ",
711
+ "lstrip": false,
712
+ "normalized": true,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": false
716
+ },
717
+ "50352": {
718
+ "content": " ọzọ",
719
+ "lstrip": false,
720
+ "normalized": true,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": false
724
+ },
725
+ "50353": {
726
+ "content": " iri",
727
+ "lstrip": false,
728
+ "normalized": true,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": false
732
+ },
733
+ "50354": {
734
+ "content": "nwe",
735
+ "lstrip": false,
736
+ "normalized": true,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": false
740
+ },
741
+ "50355": {
742
+ "content": "gbo",
743
+ "lstrip": false,
744
+ "normalized": true,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": false
748
+ },
749
+ "50356": {
750
+ "content": "tụtụ",
751
+ "lstrip": false,
752
+ "normalized": true,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": false
756
+ },
757
+ "50357": {
758
+ "content": " iji",
759
+ "lstrip": false,
760
+ "normalized": true,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": false
764
+ },
765
+ "50358": {
766
+ "content": "sụ",
767
+ "lstrip": false,
768
+ "normalized": true,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": false
772
+ },
773
+ "50359": {
774
+ "content": " ny",
775
+ "lstrip": false,
776
+ "normalized": true,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": false
780
+ },
781
+ "50360": {
782
+ "content": "bodo",
783
+ "lstrip": false,
784
+ "normalized": true,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": false
788
+ },
789
+ "50361": {
790
+ "content": "arị",
791
+ "lstrip": false,
792
+ "normalized": true,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": false
796
+ },
797
+ "50362": {
798
+ "content": "’",
799
+ "lstrip": false,
800
+ "normalized": true,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": false
804
+ },
805
+ "50363": {
806
+ "content": " mkp",
807
+ "lstrip": false,
808
+ "normalized": true,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": false
812
+ },
813
+ "50364": {
814
+ "content": " nkiri",
815
+ "lstrip": false,
816
+ "normalized": true,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": false
820
+ }
821
+ },
822
+ "bos_token": "</s>",
823
+ "clean_up_tokenization_spaces": true,
824
+ "eos_token": "</s>",
825
+ "errors": "replace",
826
+ "model_max_length": 1000000000000000019884624838656,
827
+ "pad_token": "<pad>",
828
+ "tokenizer_class": "GPT2Tokenizer",
829
+ "unk_token": "</s>"
830
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-ibo/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/added_tokens.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ " Awọn": 50326,
3
+ " ala": 50350,
4
+ " ara": 50339,
5
+ " ati": 50275,
6
+ " awọn": 50271,
7
+ " gb": 50274,
8
+ " gba": 50318,
9
+ " gbo": 50322,
10
+ " gbogbo": 50331,
11
+ " ile": 50319,
12
+ " iṣẹ": 50345,
13
+ " jẹ": 50290,
14
+ " kọ": 50355,
15
+ " lati": 50282,
16
+ " lori": 50321,
17
+ " lá": 50332,
18
+ " ló": 50335,
19
+ " lẹ": 50316,
20
+ " lọ": 50296,
21
+ " máa": 50344,
22
+ " mẹ": 50343,
23
+ " mọ": 50334,
24
+ " naa": 50297,
25
+ " ninu": 50320,
26
+ " nipa": 50333,
27
+ " ní": 50289,
28
+ " ohun": 50311,
29
+ " pé": 50317,
30
+ " pẹ": 50308,
31
+ " pẹlu": 50315,
32
+ " rẹ": 50292,
33
+ " sí": 50337,
34
+ " sọ": 50304,
35
+ " tabi": 50360,
36
+ " tí": 50307,
37
+ " tó": 50305,
38
+ " wọn": 50281,
39
+ " yii": 50341,
40
+ " yoo": 50327,
41
+ " Ì": 50361,
42
+ " àwọn": 50309,
43
+ " è": 50325,
44
+ " ì": 50287,
45
+ " ò": 50313,
46
+ " ó": 50348,
47
+ " ń": 50312,
48
+ " ṣ": 50273,
49
+ " ṣe": 50278,
50
+ " ẹ": 50279,
51
+ " Ọ": 50301,
52
+ " ọ": 50272,
53
+ " ọmọ": 50330,
54
+ " “": 50351,
55
+ "bẹ": 50340,
56
+ "dun": 50338,
57
+ "gba": 50283,
58
+ "gbe": 50294,
59
+ "gbo": 50295,
60
+ "gbà": 50356,
61
+ "gbẹ": 50336,
62
+ "iṣẹ": 50353,
63
+ "jọ": 50288,
64
+ "kọ": 50286,
65
+ "lẹ": 50277,
66
+ "lọ": 50293,
67
+ "mọ": 50284,
68
+ "pọ": 50298,
69
+ "rí": 50310,
70
+ "rẹ": 50314,
71
+ "rọ": 50300,
72
+ "sẹ": 50323,
73
+ "tẹ": 50358,
74
+ "tọ": 50324,
75
+ "wọ": 50299,
76
+ "wọn": 50268,
77
+ "yin": 50328,
78
+ "yẹ": 50349,
79
+ "àn": 50303,
80
+ "áa": 50329,
81
+ "áà": 50347,
82
+ "ìn": 50352,
83
+ "ùn": 50357,
84
+ "ún": 50302,
85
+ "̀": 50280,
86
+ "́": 50276,
87
+ "̣": 50270,
88
+ "Ṣ": 50354,
89
+ "ṣ": 50269,
90
+ "ṣe": 50306,
91
+ "ṣẹ": 50285,
92
+ "Ẹ": 50346,
93
+ "ẹ": 50266,
94
+ "Ọ": 50291,
95
+ "ọ": 50265,
96
+ "ọn": 50267,
97
+ "ọna": 50359,
98
+ "”": 50342
99
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/all_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"perplexity": 1.8150009037188144}
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/SAN/intelsys/llm/aszablew/UCL_FYP/final_cpt_models/prop-0.25/opt_100-add/100M-yor",
3
+ "_remove_final_layer_norm": false,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "relu",
6
+ "architectures": [
7
+ "OPTForCausalLM"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "bos_token_id": 2,
11
+ "do_layer_norm_before": true,
12
+ "dropout": 0.1,
13
+ "enable_bias": true,
14
+ "eos_token_id": 2,
15
+ "ffn_dim": 8192,
16
+ "hidden_size": 2048,
17
+ "init_std": 0.02,
18
+ "layer_norm_elementwise_affine": true,
19
+ "layerdrop": 0.0,
20
+ "max_position_embeddings": 2048,
21
+ "model_type": "opt",
22
+ "num_attention_heads": 32,
23
+ "num_hidden_layers": 24,
24
+ "pad_token_id": 1,
25
+ "prefix": "</s>",
26
+ "torch_dtype": "bfloat16",
27
+ "transformers_version": "4.39.3",
28
+ "use_cache": true,
29
+ "vocab_size": 50432,
30
+ "word_embed_proj_dim": 2048
31
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 1,
6
+ "transformers_version": "4.39.3"
7
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e82bae6fc31987f8b54aba592ccb4d2f2765cc1e6d1e98b5c2e11aa27f779bb
3
+ size 2838873246
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/tokenizer_config.json ADDED
@@ -0,0 +1,830 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "1": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "2": {
14
+ "content": "</s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "43998": {
22
+ "content": "ì",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "50096": {
30
+ "content": "ò",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "50101": {
38
+ "content": "ù",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "50265": {
46
+ "content": "ọ",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "50266": {
54
+ "content": "ẹ",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "50267": {
62
+ "content": "ọn",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "50268": {
70
+ "content": "wọn",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "50269": {
78
+ "content": "ṣ",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "50270": {
86
+ "content": "̣",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "50271": {
94
+ "content": " awọn",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "50272": {
102
+ "content": " ọ",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "50273": {
110
+ "content": " ṣ",
111
+ "lstrip": false,
112
+ "normalized": true,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "50274": {
118
+ "content": " gb",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "50275": {
126
+ "content": " ati",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "50276": {
134
+ "content": "́",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "50277": {
142
+ "content": "lẹ",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "50278": {
150
+ "content": " ṣe",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "50279": {
158
+ "content": " ẹ",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "50280": {
166
+ "content": "̀",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "50281": {
174
+ "content": " wọn",
175
+ "lstrip": false,
176
+ "normalized": true,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "50282": {
182
+ "content": " lati",
183
+ "lstrip": false,
184
+ "normalized": true,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "50283": {
190
+ "content": "gba",
191
+ "lstrip": false,
192
+ "normalized": true,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "50284": {
198
+ "content": "mọ",
199
+ "lstrip": false,
200
+ "normalized": true,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "50285": {
206
+ "content": "ṣẹ",
207
+ "lstrip": false,
208
+ "normalized": true,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "50286": {
214
+ "content": "kọ",
215
+ "lstrip": false,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "50287": {
222
+ "content": " ì",
223
+ "lstrip": false,
224
+ "normalized": true,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "50288": {
230
+ "content": "jọ",
231
+ "lstrip": false,
232
+ "normalized": true,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "50289": {
238
+ "content": " ní",
239
+ "lstrip": false,
240
+ "normalized": true,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "50290": {
246
+ "content": " jẹ",
247
+ "lstrip": false,
248
+ "normalized": true,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "50291": {
254
+ "content": "Ọ",
255
+ "lstrip": false,
256
+ "normalized": true,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "50292": {
262
+ "content": " rẹ",
263
+ "lstrip": false,
264
+ "normalized": true,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "50293": {
270
+ "content": "lọ",
271
+ "lstrip": false,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "50294": {
278
+ "content": "gbe",
279
+ "lstrip": false,
280
+ "normalized": true,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "50295": {
286
+ "content": "gbo",
287
+ "lstrip": false,
288
+ "normalized": true,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "50296": {
294
+ "content": " lọ",
295
+ "lstrip": false,
296
+ "normalized": true,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "50297": {
302
+ "content": " naa",
303
+ "lstrip": false,
304
+ "normalized": true,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "50298": {
310
+ "content": "pọ",
311
+ "lstrip": false,
312
+ "normalized": true,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ },
317
+ "50299": {
318
+ "content": "wọ",
319
+ "lstrip": false,
320
+ "normalized": true,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": false
324
+ },
325
+ "50300": {
326
+ "content": "rọ",
327
+ "lstrip": false,
328
+ "normalized": true,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": false
332
+ },
333
+ "50301": {
334
+ "content": " Ọ",
335
+ "lstrip": false,
336
+ "normalized": true,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": false
340
+ },
341
+ "50302": {
342
+ "content": "ún",
343
+ "lstrip": false,
344
+ "normalized": true,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": false
348
+ },
349
+ "50303": {
350
+ "content": "àn",
351
+ "lstrip": false,
352
+ "normalized": true,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": false
356
+ },
357
+ "50304": {
358
+ "content": " sọ",
359
+ "lstrip": false,
360
+ "normalized": true,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": false
364
+ },
365
+ "50305": {
366
+ "content": " tó",
367
+ "lstrip": false,
368
+ "normalized": true,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": false
372
+ },
373
+ "50306": {
374
+ "content": "ṣe",
375
+ "lstrip": false,
376
+ "normalized": true,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": false
380
+ },
381
+ "50307": {
382
+ "content": " tí",
383
+ "lstrip": false,
384
+ "normalized": true,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": false
388
+ },
389
+ "50308": {
390
+ "content": " pẹ",
391
+ "lstrip": false,
392
+ "normalized": true,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": false
396
+ },
397
+ "50309": {
398
+ "content": " àwọn",
399
+ "lstrip": false,
400
+ "normalized": true,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": false
404
+ },
405
+ "50310": {
406
+ "content": "rí",
407
+ "lstrip": false,
408
+ "normalized": true,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": false
412
+ },
413
+ "50311": {
414
+ "content": " ohun",
415
+ "lstrip": false,
416
+ "normalized": true,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": false
420
+ },
421
+ "50312": {
422
+ "content": " ń",
423
+ "lstrip": false,
424
+ "normalized": true,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": false
428
+ },
429
+ "50313": {
430
+ "content": " ò",
431
+ "lstrip": false,
432
+ "normalized": true,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": false
436
+ },
437
+ "50314": {
438
+ "content": "rẹ",
439
+ "lstrip": false,
440
+ "normalized": true,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": false
444
+ },
445
+ "50315": {
446
+ "content": " pẹlu",
447
+ "lstrip": false,
448
+ "normalized": true,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": false
452
+ },
453
+ "50316": {
454
+ "content": " lẹ",
455
+ "lstrip": false,
456
+ "normalized": true,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": false
460
+ },
461
+ "50317": {
462
+ "content": " pé",
463
+ "lstrip": false,
464
+ "normalized": true,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": false
468
+ },
469
+ "50318": {
470
+ "content": " gba",
471
+ "lstrip": false,
472
+ "normalized": true,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": false
476
+ },
477
+ "50319": {
478
+ "content": " ile",
479
+ "lstrip": false,
480
+ "normalized": true,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": false
484
+ },
485
+ "50320": {
486
+ "content": " ninu",
487
+ "lstrip": false,
488
+ "normalized": true,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": false
492
+ },
493
+ "50321": {
494
+ "content": " lori",
495
+ "lstrip": false,
496
+ "normalized": true,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": false
500
+ },
501
+ "50322": {
502
+ "content": " gbo",
503
+ "lstrip": false,
504
+ "normalized": true,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": false
508
+ },
509
+ "50323": {
510
+ "content": "sẹ",
511
+ "lstrip": false,
512
+ "normalized": true,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": false
516
+ },
517
+ "50324": {
518
+ "content": "tọ",
519
+ "lstrip": false,
520
+ "normalized": true,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": false
524
+ },
525
+ "50325": {
526
+ "content": " è",
527
+ "lstrip": false,
528
+ "normalized": true,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": false
532
+ },
533
+ "50326": {
534
+ "content": " Awọn",
535
+ "lstrip": false,
536
+ "normalized": true,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": false
540
+ },
541
+ "50327": {
542
+ "content": " yoo",
543
+ "lstrip": false,
544
+ "normalized": true,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": false
548
+ },
549
+ "50328": {
550
+ "content": "yin",
551
+ "lstrip": false,
552
+ "normalized": true,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": false
556
+ },
557
+ "50329": {
558
+ "content": "áa",
559
+ "lstrip": false,
560
+ "normalized": true,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": false
564
+ },
565
+ "50330": {
566
+ "content": " ọmọ",
567
+ "lstrip": false,
568
+ "normalized": true,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": false
572
+ },
573
+ "50331": {
574
+ "content": " gbogbo",
575
+ "lstrip": false,
576
+ "normalized": true,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": false
580
+ },
581
+ "50332": {
582
+ "content": " lá",
583
+ "lstrip": false,
584
+ "normalized": true,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": false
588
+ },
589
+ "50333": {
590
+ "content": " nipa",
591
+ "lstrip": false,
592
+ "normalized": true,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": false
596
+ },
597
+ "50334": {
598
+ "content": " mọ",
599
+ "lstrip": false,
600
+ "normalized": true,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": false
604
+ },
605
+ "50335": {
606
+ "content": " ló",
607
+ "lstrip": false,
608
+ "normalized": true,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": false
612
+ },
613
+ "50336": {
614
+ "content": "gbẹ",
615
+ "lstrip": false,
616
+ "normalized": true,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": false
620
+ },
621
+ "50337": {
622
+ "content": " sí",
623
+ "lstrip": false,
624
+ "normalized": true,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": false
628
+ },
629
+ "50338": {
630
+ "content": "dun",
631
+ "lstrip": false,
632
+ "normalized": true,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": false
636
+ },
637
+ "50339": {
638
+ "content": " ara",
639
+ "lstrip": false,
640
+ "normalized": true,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": false
644
+ },
645
+ "50340": {
646
+ "content": "bẹ",
647
+ "lstrip": false,
648
+ "normalized": true,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": false
652
+ },
653
+ "50341": {
654
+ "content": " yii",
655
+ "lstrip": false,
656
+ "normalized": true,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": false
660
+ },
661
+ "50342": {
662
+ "content": "”",
663
+ "lstrip": false,
664
+ "normalized": true,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": false
668
+ },
669
+ "50343": {
670
+ "content": " mẹ",
671
+ "lstrip": false,
672
+ "normalized": true,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": false
676
+ },
677
+ "50344": {
678
+ "content": " máa",
679
+ "lstrip": false,
680
+ "normalized": true,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": false
684
+ },
685
+ "50345": {
686
+ "content": " iṣẹ",
687
+ "lstrip": false,
688
+ "normalized": true,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": false
692
+ },
693
+ "50346": {
694
+ "content": "Ẹ",
695
+ "lstrip": false,
696
+ "normalized": true,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": false
700
+ },
701
+ "50347": {
702
+ "content": "áà",
703
+ "lstrip": false,
704
+ "normalized": true,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": false
708
+ },
709
+ "50348": {
710
+ "content": " ó",
711
+ "lstrip": false,
712
+ "normalized": true,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": false
716
+ },
717
+ "50349": {
718
+ "content": "yẹ",
719
+ "lstrip": false,
720
+ "normalized": true,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": false
724
+ },
725
+ "50350": {
726
+ "content": " ala",
727
+ "lstrip": false,
728
+ "normalized": true,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": false
732
+ },
733
+ "50351": {
734
+ "content": " “",
735
+ "lstrip": false,
736
+ "normalized": true,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": false
740
+ },
741
+ "50352": {
742
+ "content": "ìn",
743
+ "lstrip": false,
744
+ "normalized": true,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": false
748
+ },
749
+ "50353": {
750
+ "content": "iṣẹ",
751
+ "lstrip": false,
752
+ "normalized": true,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": false
756
+ },
757
+ "50354": {
758
+ "content": "Ṣ",
759
+ "lstrip": false,
760
+ "normalized": true,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": false
764
+ },
765
+ "50355": {
766
+ "content": " kọ",
767
+ "lstrip": false,
768
+ "normalized": true,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": false
772
+ },
773
+ "50356": {
774
+ "content": "gbà",
775
+ "lstrip": false,
776
+ "normalized": true,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": false
780
+ },
781
+ "50357": {
782
+ "content": "ùn",
783
+ "lstrip": false,
784
+ "normalized": true,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": false
788
+ },
789
+ "50358": {
790
+ "content": "tẹ",
791
+ "lstrip": false,
792
+ "normalized": true,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": false
796
+ },
797
+ "50359": {
798
+ "content": "ọna",
799
+ "lstrip": false,
800
+ "normalized": true,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": false
804
+ },
805
+ "50360": {
806
+ "content": " tabi",
807
+ "lstrip": false,
808
+ "normalized": true,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": false
812
+ },
813
+ "50361": {
814
+ "content": " Ì",
815
+ "lstrip": false,
816
+ "normalized": true,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": false
820
+ }
821
+ },
822
+ "bos_token": "</s>",
823
+ "clean_up_tokenization_spaces": true,
824
+ "eos_token": "</s>",
825
+ "errors": "replace",
826
+ "model_max_length": 1000000000000000019884624838656,
827
+ "pad_token": "<pad>",
828
+ "tokenizer_class": "GPT2Tokenizer",
829
+ "unk_token": "</s>"
830
+ }
adapted_models/cpt+it/prop-0.25/opt_100-add/100M-yor/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/all_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"perplexity": 1.1106812210197259}
adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/SAN/intelsys/llm/aszablew/UCL_FYP/final_cpt_models/prop-0.25/opt_100-replace/100M-amh",
3
+ "_remove_final_layer_norm": false,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "relu",
6
+ "architectures": [
7
+ "OPTForCausalLM"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "bos_token_id": 2,
11
+ "do_layer_norm_before": true,
12
+ "dropout": 0.1,
13
+ "enable_bias": true,
14
+ "eos_token_id": 2,
15
+ "ffn_dim": 8192,
16
+ "hidden_size": 2048,
17
+ "init_std": 0.02,
18
+ "layer_norm_elementwise_affine": true,
19
+ "layerdrop": 0.0,
20
+ "max_position_embeddings": 2048,
21
+ "model_type": "opt",
22
+ "num_attention_heads": 32,
23
+ "num_hidden_layers": 24,
24
+ "pad_token_id": 1,
25
+ "prefix": "</s>",
26
+ "torch_dtype": "bfloat16",
27
+ "transformers_version": "4.39.3",
28
+ "use_cache": true,
29
+ "vocab_size": 50272,
30
+ "word_embed_proj_dim": 2048
31
+ }
adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 1,
6
+ "transformers_version": "4.39.3"
7
+ }
adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b988a79da74aaca3682d8db684095aa4e9986b042e6025f31ec38ac0c3ea3af9
3
+ size 2837562526
adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "1": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "2": {
14
+ "content": "</s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ }
21
+ },
22
+ "bos_token": "</s>",
23
+ "clean_up_tokenization_spaces": true,
24
+ "eos_token": "</s>",
25
+ "errors": "replace",
26
+ "model_max_length": 1000000000000000019884624838656,
27
+ "pad_token": "<pad>",
28
+ "tokenizer_class": "GPT2Tokenizer",
29
+ "unk_token": "</s>"
30
+ }
adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-amh/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
adapted_models/cpt+it/prop-0.25/opt_100-replace/100M-hau/all_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"perplexity": 1.1799651161320075}