TeeA committed on
Commit
56ef319
1 Parent(s): 8892578
added_tokens.json CHANGED
@@ -1,11 +1,13 @@
1
  {
2
- "</s_cols>": 40033,
3
- "</s_rows>": 40031,
4
- "</s_text>": 40035,
5
- "</s_vichart>": 40037,
6
- "<s_cols>": 40032,
7
- "<s_rows>": 40030,
8
- "<s_text>": 40034,
9
- "<s_vichart>": 40036,
10
- "<sep/>": 40038
 
 
11
  }
 
1
  {
2
+ "</s_cols>": 57528,
3
+ "</s_rows>": 57526,
4
+ "</s_text>": 57530,
5
+ "</s_vichart>": 57532,
6
+ "<s_cols>": 57527,
7
+ "<s_iitcdip>": 57523,
8
+ "<s_rows>": 57525,
9
+ "<s_synthdog>": 57524,
10
+ "<s_text>": 57529,
11
+ "<s_vichart>": 57531,
12
+ "<sep/>": 57522
13
  }
preprocessor_config.json CHANGED
@@ -1,4 +1,22 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "do_align_long_axis": false,
3
  "do_normalize": true,
4
  "do_pad": true,
 
1
  {
2
+ "_valid_processor_keys": [
3
+ "images",
4
+ "do_resize",
5
+ "size",
6
+ "resample",
7
+ "do_thumbnail",
8
+ "do_align_long_axis",
9
+ "do_pad",
10
+ "random_padding",
11
+ "do_rescale",
12
+ "rescale_factor",
13
+ "do_normalize",
14
+ "image_mean",
15
+ "image_std",
16
+ "return_tensors",
17
+ "data_format",
18
+ "input_data_format"
19
+ ],
20
  "do_align_long_axis": false,
21
  "do_normalize": true,
22
  "do_pad": true,
sentencepiece.bpe.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
- size 5069051
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb9e3dce4c326195d08fc3dd0f7e2eee1da8595c847bf4c1a9c78b7a82d47e2d
3
+ size 1296245
special_tokens_map.json CHANGED
@@ -1,4 +1,8 @@
1
  {
 
 
 
 
2
  "bos_token": {
3
  "content": "<s>",
4
  "lstrip": false,
 
1
  {
2
+ "additional_special_tokens": [
3
+ "<s_iitcdip>",
4
+ "<s_synthdog>"
5
+ ],
6
  "bos_token": {
7
  "content": "<s>",
8
  "lstrip": false,
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -32,7 +32,7 @@
32
  "single_word": false,
33
  "special": true
34
  },
35
- "40029": {
36
  "content": "<mask>",
37
  "lstrip": true,
38
  "normalized": true,
@@ -40,7 +40,31 @@
40
  "single_word": false,
41
  "special": true
42
  },
43
- "40030": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  "content": "<s_rows>",
45
  "lstrip": false,
46
  "normalized": true,
@@ -48,7 +72,7 @@
48
  "single_word": false,
49
  "special": false
50
  },
51
- "40031": {
52
  "content": "</s_rows>",
53
  "lstrip": false,
54
  "normalized": true,
@@ -56,7 +80,7 @@
56
  "single_word": false,
57
  "special": false
58
  },
59
- "40032": {
60
  "content": "<s_cols>",
61
  "lstrip": false,
62
  "normalized": true,
@@ -64,7 +88,7 @@
64
  "single_word": false,
65
  "special": false
66
  },
67
- "40033": {
68
  "content": "</s_cols>",
69
  "lstrip": false,
70
  "normalized": true,
@@ -72,7 +96,7 @@
72
  "single_word": false,
73
  "special": false
74
  },
75
- "40034": {
76
  "content": "<s_text>",
77
  "lstrip": false,
78
  "normalized": true,
@@ -80,7 +104,7 @@
80
  "single_word": false,
81
  "special": false
82
  },
83
- "40035": {
84
  "content": "</s_text>",
85
  "lstrip": false,
86
  "normalized": true,
@@ -88,7 +112,7 @@
88
  "single_word": false,
89
  "special": false
90
  },
91
- "40036": {
92
  "content": "<s_vichart>",
93
  "lstrip": false,
94
  "normalized": true,
@@ -96,33 +120,29 @@
96
  "single_word": false,
97
  "special": false
98
  },
99
- "40037": {
100
  "content": "</s_vichart>",
101
  "lstrip": false,
102
  "normalized": true,
103
  "rstrip": false,
104
  "single_word": false,
105
  "special": false
106
- },
107
- "40038": {
108
- "content": "<sep/>",
109
- "lstrip": false,
110
- "normalized": true,
111
- "rstrip": false,
112
- "single_word": false,
113
- "special": false
114
  }
115
  },
 
 
 
 
116
  "bos_token": "<s>",
117
  "clean_up_tokenization_spaces": true,
118
  "cls_token": "<s>",
119
  "eos_token": "</s>",
120
  "mask_token": "<mask>",
121
- "model_max_length": 1024,
122
  "pad_token": "<pad>",
123
  "processor_class": "DonutProcessor",
124
  "sep_token": "</s>",
125
  "sp_model_kwargs": {},
126
- "tokenizer_class": "BartphoTokenizer",
127
  "unk_token": "<unk>"
128
  }
 
32
  "single_word": false,
33
  "special": true
34
  },
35
+ "57521": {
36
  "content": "<mask>",
37
  "lstrip": true,
38
  "normalized": true,
 
40
  "single_word": false,
41
  "special": true
42
  },
43
+ "57522": {
44
+ "content": "<sep/>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "57523": {
52
+ "content": "<s_iitcdip>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "57524": {
60
+ "content": "<s_synthdog>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "57525": {
68
  "content": "<s_rows>",
69
  "lstrip": false,
70
  "normalized": true,
 
72
  "single_word": false,
73
  "special": false
74
  },
75
+ "57526": {
76
  "content": "</s_rows>",
77
  "lstrip": false,
78
  "normalized": true,
 
80
  "single_word": false,
81
  "special": false
82
  },
83
+ "57527": {
84
  "content": "<s_cols>",
85
  "lstrip": false,
86
  "normalized": true,
 
88
  "single_word": false,
89
  "special": false
90
  },
91
+ "57528": {
92
  "content": "</s_cols>",
93
  "lstrip": false,
94
  "normalized": true,
 
96
  "single_word": false,
97
  "special": false
98
  },
99
+ "57529": {
100
  "content": "<s_text>",
101
  "lstrip": false,
102
  "normalized": true,
 
104
  "single_word": false,
105
  "special": false
106
  },
107
+ "57530": {
108
  "content": "</s_text>",
109
  "lstrip": false,
110
  "normalized": true,
 
112
  "single_word": false,
113
  "special": false
114
  },
115
+ "57531": {
116
  "content": "<s_vichart>",
117
  "lstrip": false,
118
  "normalized": true,
 
120
  "single_word": false,
121
  "special": false
122
  },
123
+ "57532": {
124
  "content": "</s_vichart>",
125
  "lstrip": false,
126
  "normalized": true,
127
  "rstrip": false,
128
  "single_word": false,
129
  "special": false
 
 
 
 
 
 
 
 
130
  }
131
  },
132
+ "additional_special_tokens": [
133
+ "<s_iitcdip>",
134
+ "<s_synthdog>"
135
+ ],
136
  "bos_token": "<s>",
137
  "clean_up_tokenization_spaces": true,
138
  "cls_token": "<s>",
139
  "eos_token": "</s>",
140
  "mask_token": "<mask>",
141
+ "model_max_length": 1000000000000000019884624838656,
142
  "pad_token": "<pad>",
143
  "processor_class": "DonutProcessor",
144
  "sep_token": "</s>",
145
  "sp_model_kwargs": {},
146
+ "tokenizer_class": "XLMRobertaTokenizer",
147
  "unk_token": "<unk>"
148
  }