mwz committed on
Commit
1fbb660
1 Parent(s): 8191c74

Upload tokenizer

Browse files
Files changed (4) hide show
  1. added_tokens.json +1 -3
  2. special_tokens_map.json +16 -0
  3. tokenizer_config.json +0 -36
  4. vocab.json +81 -81
added_tokens.json CHANGED
@@ -1,6 +1,4 @@
1
  {
2
  "</s>": 87,
3
- "<s>": 86,
4
- "[PAD]": 85,
5
- "[UNK]": 84
6
  }
 
1
  {
2
  "</s>": 87,
3
+ "<s>": 86
 
 
4
  }
special_tokens_map.json CHANGED
@@ -1,4 +1,20 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": "<s>",
3
  "eos_token": "</s>",
4
  "pad_token": "[PAD]",
 
1
  {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</s>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
  "bos_token": "<s>",
19
  "eos_token": "</s>",
20
  "pad_token": "[PAD]",
tokenizer_config.json CHANGED
@@ -1,39 +1,4 @@
1
  {
2
- "added_tokens_decoder": {
3
- "84": {
4
- "content": "[UNK]",
5
- "lstrip": true,
6
- "normalized": false,
7
- "rstrip": true,
8
- "single_word": false,
9
- "special": false
10
- },
11
- "85": {
12
- "content": "[PAD]",
13
- "lstrip": true,
14
- "normalized": false,
15
- "rstrip": true,
16
- "single_word": false,
17
- "special": false
18
- },
19
- "86": {
20
- "content": "<s>",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "87": {
28
- "content": "</s>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- }
35
- },
36
- "additional_special_tokens": [],
37
  "bos_token": "<s>",
38
  "clean_up_tokenization_spaces": true,
39
  "do_lower_case": false,
@@ -43,7 +8,6 @@
43
  "replace_word_delimiter_char": " ",
44
  "target_lang": null,
45
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
46
- "tokenizer_file": null,
47
  "unk_token": "[UNK]",
48
  "word_delimiter_token": "|"
49
  }
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": "<s>",
3
  "clean_up_tokenization_spaces": true,
4
  "do_lower_case": false,
 
8
  "replace_word_delimiter_char": " ",
9
  "target_lang": null,
10
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
 
11
  "unk_token": "[UNK]",
12
  "word_delimiter_token": "|"
13
  }
vocab.json CHANGED
@@ -1,88 +1,88 @@
1
  {
2
- "'": 10,
3
  "[PAD]": 85,
4
  "[UNK]": 84,
5
- "`": 22,
6
- "|": 1,
7
- "،": 55,
8
- "ؑ": 57,
9
- "ؓ": 72,
10
- "ؔ": 71,
11
- "؛": 50,
12
- "؟": 36,
13
  "ء": 26,
14
- "آ": 8,
15
- "أ": 4,
16
- "ؤ": 45,
17
- "ئ": 48,
18
- "ا": 39,
19
- "ب": 12,
20
- "ت": 67,
21
- "ث": 83,
22
- "ج": 78,
23
- "ح": 66,
24
- "خ": 74,
25
- "د": 65,
26
- "ذ": 5,
27
- "ر": 82,
28
- "ز": 60,
29
- "س": 21,
30
- "ش": 41,
31
- "ص": 42,
32
- "ض": 0,
33
- "ط": 20,
34
- "ظ": 49,
35
- "ع": 17,
36
- "غ": 81,
37
- "ف": 70,
38
- "ق": 46,
39
- "ك": 6,
40
- "ل": 28,
41
- "م": 25,
42
- "ن": 11,
43
- "ه": 34,
44
- "و": 61,
45
- "ى": 80,
46
- "ي": 52,
47
- "ً": 19,
48
- "َ": 9,
49
- "ُ": 15,
50
- "ِ": 31,
51
  "ّ": 35,
52
- "ْ": 27,
53
- "ٓ": 2,
54
- "ٔ": 23,
55
- "ٗ": 47,
56
- "ٰ": 16,
57
- "ٹ": 38,
58
- "پ": 69,
59
- "چ": 40,
60
- "ڈ": 54,
61
- "ڑ": 33,
62
- "ژ": 62,
63
- "ک": 64,
64
- "گ": 73,
65
- "ں": 43,
66
- "ھ": 75,
67
- "ہ": 59,
68
- "ۂ": 13,
69
- "ۃ": 56,
70
- "ی": 3,
71
- "ے": 51,
72
- "ۓ": 68,
73
- "۔": 7,
74
- "’": 79,
75
- "…": 76,
76
- "ﭨ": 53,
77
- "ﮭ": 24,
78
  "ﮯ": 63,
79
- "ﯾ": 29,
80
- "ﷲ": 14,
81
- "ﷺ": 77,
82
- "ﺗ": 32,
83
- "ﺘ": 30,
84
- "ﺩ": 37,
85
- "ﺲ": 58,
86
- "ﻧ": 44,
87
- "ﻮ": 18
88
  }
 
1
  {
2
+ "'": 70,
3
  "[PAD]": 85,
4
  "[UNK]": 84,
5
+ "`": 81,
6
+ "|": 33,
7
+ "،": 65,
8
+ "ؑ": 80,
9
+ "ؓ": 18,
10
+ "ؔ": 38,
11
+ "؛": 68,
12
+ "؟": 30,
13
  "ء": 26,
14
+ "آ": 21,
15
+ "أ": 58,
16
+ "ؤ": 29,
17
+ "ئ": 74,
18
+ "ا": 22,
19
+ "ب": 11,
20
+ "ت": 3,
21
+ "ث": 62,
22
+ "ج": 0,
23
+ "ح": 10,
24
+ "خ": 44,
25
+ "د": 53,
26
+ "ذ": 42,
27
+ "ر": 1,
28
+ "ز": 64,
29
+ "س": 14,
30
+ "ش": 7,
31
+ "ص": 46,
32
+ "ض": 66,
33
+ "ط": 51,
34
+ "ظ": 17,
35
+ "ع": 13,
36
+ "غ": 40,
37
+ "ف": 56,
38
+ "ق": 71,
39
+ "ك": 16,
40
+ "ل": 2,
41
+ "م": 34,
42
+ "ن": 24,
43
+ "ه": 37,
44
+ "و": 69,
45
+ "ى": 50,
46
+ "ي": 67,
47
+ "ً": 75,
48
+ "َ": 82,
49
+ "ُ": 6,
50
+ "ِ": 36,
51
  "ّ": 35,
52
+ "ْ": 39,
53
+ "ٓ": 20,
54
+ "ٔ": 78,
55
+ "ٗ": 4,
56
+ "ٰ": 43,
57
+ "ٹ": 41,
58
+ "پ": 57,
59
+ "چ": 59,
60
+ "ڈ": 45,
61
+ "ڑ": 73,
62
+ "ژ": 54,
63
+ "ک": 83,
64
+ "گ": 9,
65
+ "ں": 28,
66
+ "ھ": 47,
67
+ "ہ": 32,
68
+ "ۂ": 25,
69
+ "ۃ": 76,
70
+ "ی": 79,
71
+ "ے": 72,
72
+ "ۓ": 48,
73
+ "۔": 31,
74
+ "’": 61,
75
+ "…": 55,
76
+ "ﭨ": 60,
77
+ "ﮭ": 8,
78
  "ﮯ": 63,
79
+ "ﯾ": 52,
80
+ "ﷲ": 49,
81
+ "ﷺ": 15,
82
+ "ﺗ": 19,
83
+ "ﺘ": 23,
84
+ "ﺩ": 5,
85
+ "ﺲ": 27,
86
+ "ﻧ": 77,
87
+ "ﻮ": 12
88
  }