Upload tokenizer

Browse files

Files changed (5) hide show

README.md +199 -0
added_tokens.json +3 -0
tokenizer.json +145 -109
tokenizer_config.json +47 -3
vocab.json +1 -1

README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+library_name: transformers
+tags: []
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+This is the model card of a 🤗 transformers model that has been pushed to the Hub. This model card has been automatically generated.
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information is needed to make further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!-- fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]

added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<|endoftext|>": 196
+}

tokenizer.json CHANGED Viewed

@@ -38,6 +38,15 @@
       "rstrip": false,
       "normalized": false,
       "special": true
     }
   ],
   "normalizer": {
@@ -116,61 +125,61 @@
       "ɑ̃": 48,
       "r": 49,
       "x": 50,
-      "y": 51,
-      "ɛ̃": 52,
       "a": 53,
-      "ʁ": 54,
-      "e": 55,
-      "ɔ̃": 56,
-      "u": 57,
-      "o": 58,
-      "œ̃": 59,
-      "ø": 60,
-      "œ": 61,
-      "oː": 62,
-      "yː": 63,
-      "ɲ": 64,
-      "aː": 65,
-      "ts": 66,
-      "eː": 67,
-      "ʀ": 68,
-      "ç": 69,
-      "ɐ": 70,
-      "ɛɪ": 71,
-      "ʏ": 72,
-      "ɛː": 73,
-      "pf": 74,
-      "øː": 75,
-      "ã": 76,
-      "ɔː": 77,
-      "ɾ": 78,
-      "β": 79,
-      "ʎ": 80,
-      "ɣ": 81,
-      "ʝ": 82,
-      "oɪ": 83,
-      "eʊ": 84,
-      "pː": 85,
-      "ɟ": 86,
-      "ʋ": 87,
-      "ɪː": 88,
-      "ɵ": 89,
-      "œy": 90,
-      "tʲ": 91,
-      "au̯": 92,
-      "˥˩": 93,
-      "ʂ": 94,
-      "ɻ̩": 95,
-      "˧˥": 96,
-      "ɤ": 97,
-      "kʰ": 98,
-      "˥": 99,
-      "ʈʂʰ": 100,
-      "ɕ": 101,
-      "ei̯": 102,
-      "pʰ": 103,
-      "ai̯": 104,
-      "ou̯": 105,
       "tɕ": 106,
       "ʈʂ": 107,
       "ɹ̩": 108,
@@ -179,61 +188,88 @@
       "ɻ": 111,
       "ɥ": 112,
       "tsʰ": 113,
-      "ɚ": 114,
-      "ɯː": 115,
-      "ɯ": 116,
-      "pʲ": 117,
-      "ɸ": 118,
-      "rʲ": 119,
-      "kʲ": 120,
-      "bʲ": 121,
-      "mʲ": 122,
-      "˧": 123,
-      "˨˩": 124,
-      "ei": 125,
-      "˩˧": 126,
-      "˨": 127,
-      "ɐi": 128,
-      "m̩": 129,
-      "ou": 130,
-      "aːĭ": 131,
-      "ɵy": 132,
-      "ɔːĭ": 133,
-      "ɐu": 134,
-      "iːŭ": 135,
-      "aːŭ": 136,
-      "œː": 137,
-      "uːĭ": 138,
-      "kː": 139,
-      "æi": 140,
-      "yi": 141,
-      "ɵː": 142,
-      "tː": 143,
-      "æː": 144,
-      "dʑ": 145,
-      "l̩": 146,
-      "ɒ": 147,
-      "ʌː": 148,
-      "ɜ": 149,
-      "ʔ": 150,
-      "s̺": 151,
-      "ts̻": 152,
-      "s̻": 153,
-      "c": 154,
-      "ts̺": 155,
-      "tsː": 156,
-      "ɟː": 157,
-      "t̠ʃː": 158,
-      "ɡː": 159,
-      "ɑː": 160,
-      "dː": 161,
-      "cː": 162,
-      "bː": 163,
-      "ɫ": 164,
-      "ʊː": 165,
-      "q": 166,
-      "øy": 167,
-      "χ": 168
     },
     "unk_token": "UNK"
   }

       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 196,
+      "content": "<|endoftext|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": {
       "ɑ̃": 48,
       "r": 49,
       "x": 50,
+      "nʲ": 51,
+      "ɒ": 52,
       "a": 53,
+      "ɑː": 54,
+      "ɔː": 55,
+      "əʊ": 56,
+      "ɐ": 57,
+      "eə": 58,
+      "ʊə": 59,
+      "n̩": 60,
+      "aː": 61,
+      "y": 62,
+      "ɛ̃": 63,
+      "ʁ": 64,
+      "e": 65,
+      "ɔ̃": 66,
+      "u": 67,
+      "o": 68,
+      "œ̃": 69,
+      "ø": 70,
+      "œ": 71,
+      "oː": 72,
+      "yː": 73,
+      "ɲ": 74,
+      "ts": 75,
+      "eː": 76,
+      "ʀ": 77,
+      "ç": 78,
+      "ɛɪ": 79,
+      "ʏ": 80,
+      "ɛː": 81,
+      "pf": 82,
+      "øː": 83,
+      "ã": 84,
+      "ɾ": 85,
+      "β": 86,
+      "ʎ": 87,
+      "ɣ": 88,
+      "ʝ": 89,
+      "oɪ": 90,
+      "eʊ": 91,
+      "pː": 92,
+      "ɟ": 93,
+      "ʋ": 94,
+      "ɪː": 95,
+      "ɵ": 96,
+      "œy": 97,
+      "tʲ": 98,
+      "au̯": 99,
+      "ʂ": 100,
+      "ɤ": 101,
+      "kʰ": 102,
+      "ʈʂʰ": 103,
+      "ɕ": 104,
+      "pʰ": 105,
       "tɕ": 106,
       "ʈʂ": 107,
       "ɹ̩": 108,
       "ɻ": 111,
       "ɥ": 112,
       "tsʰ": 113,
+      "ei̯": 114,
+      "ou̯": 115,
+      "ɻ̩": 116,
+      "ai̯": 117,
+      "ɯː": 118,
+      "ɯ": 119,
+      "pʲ": 120,
+      "ɸ": 121,
+      "rʲ": 122,
+      "kʲ": 123,
+      "bʲ": 124,
+      "mʲ": 125,
+      "kː": 126,
+      "æi": 127,
+      "yi": 128,
+      "ɵː": 129,
+      "tː": 130,
+      "æː": 131,
+      "dʑ": 132,
+      "l̩": 133,
+      "œː": 134,
+      "ʌː": 135,
+      "ɜ": 136,
+      "ʔ": 137,
+      "s̺": 138,
+      "ts̻": 139,
+      "s̻": 140,
+      "c": 141,
+      "ts̺": 142,
+      "tsː": 143,
+      "ɟː": 144,
+      "t̠ʃː": 145,
+      "ɡː": 146,
+      "dː": 147,
+      "cː": 148,
+      "bː": 149,
+      "dzː": 150,
+      "ɫ": 151,
+      "ʊː": 152,
+      "q": 153,
+      "øy": 154,
+      "χ": 155,
+      "i̯": 156,
+      "t̪": 157,
+      "d̪": 158,
+      "lʲ": 159,
+      "ɡʲ": 160,
+      "hʲ": 161,
+      "dʲ": 162,
+      "çʲ": 163,
+      "uə": 164,
+      "ŭ": 165,
+      "fʲ": 166,
+      "aɨ": 167,
+      "ɨ": 168,
+      "uɨ": 169,
+      "əɪ": 170,
+      "ɨː": 171,
+      "ɬ": 172,
+      "əɨ": 173,
+      "ɪu": 174,
+      "ʉ": 175,
+      "ʉː": 176,
+      "ɑɪ": 177,
+      "ʑ": 178,
+      "dz": 179,
+      "d̠ʒː": 180,
+      "ɐ̃": 181,
+      "ɛʊ": 182,
+      "ũ": 183,
+      "iʊ": 184,
+      "õ": 185,
+      "uɪ": 186,
+      "sʲ": 187,
+      "t̠ʃʲ": 188,
+      "ɔa": 189,
+      "ea": 190,
+      "iɪ": 191,
+      "tsʲ": 192,
+      "eo": 193,
+      "d̠ʒʲ": 194,
+      "ɾʲ": 195
     },
     "unk_token": "UNK"
   }

tokenizer_config.json CHANGED Viewed

@@ -1,8 +1,52 @@
 {
   "add_prefix_space": false,
-  "bos_token": "<|endoftext|>",
-  "eos_token": "<|endoftext|>",
   "model_max_length": 1000000000000000019884624838656,
   "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "<|endoftext|>"
 }

 {
   "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "UNK",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "PAD",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "BOS",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "EOS",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "196": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "BOS",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "EOS",
   "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "PAD",
   "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "UNK"
 }

vocab.json CHANGED Viewed

@@ -1 +1 @@

- {"UNK":0,"PAD":1,"BOS":2,"EOS":3,"WORD_BOUNDARY":4,"UTT_BOUNDARY":5,"s":6,"iː":7,"ð":8,"ɛ":9,"ɹ":10,"z":11,"ʌ":12,"f":13,"eɪ":14,"w":15,"ɪ":16,"ɡ":17,"l":18,"æ":19,"ɑ":20,"h":21,"ə":22,"ʊ":23,"k":24,"p":25,"uː":26,"b":27,"i":28,"t":29,"aɪ":30,"θ":31,"ŋ":32,"j":33,"ɔ":34,"m":35,"ɔɪ":36,"n":37,"d":38,"oʊ":39,"aʊ":40,"v":41,"ɜː":42,"t̠ʃ":43,"d̠ʒ":44,"ʃ":45,"iə":46,"ʒ":47,"ɑ̃":48,"r":49,"x":50,"y":51,"ɛ̃":52,"a":53,"ʁ":54,"e":55,"ɔ̃":56,"u":57,"o":58,"œ̃":59,"ø":60,"œ":61,"oː":62,"yː":63,"ɲ":64,"aː":65,"ts":66,"eː":67,"ʀ":68,"ç":69,"ɐ":70,"ɛɪ":71,"ʏ":72,"ɛː":73,"pf":74,"øː":75,"ã":76,"ɔː":77,"ɾ":78,"β":79,"ʎ":80,"ɣ":81,"ʝ":82,"oɪ":83,"eʊ":84,"pː":85,"ɟ":86,"ʋ":87,"ɪː":88,"ɵ":89,"œy":90,"tʲ":91,"au̯":92,"˥˩":93,"ʂ":94,"ɻ̩":95,"˧˥":96,"ɤ":97,"kʰ":98,"˥":99,"ʈʂʰ":100,"ɕ":101,"ei̯":102,"pʰ":103,"ai̯":104,"ou̯":105,"tɕ":106,"ʈʂ":107,"ɹ̩":108,"tɕʰ":109,"tʰ":110,"ɻ":111,"ɥ":112,"tsʰ":113,"ɚ":114,"ɯː":115,"ɯ":116,"pʲ":117,"ɸ":118,"rʲ":119,"kʲ":120,"bʲ":121,"mʲ":122,"˧":123,"˨˩":124,"ei":125,"˩˧":126,"˨":127,"ɐi":128,"m̩":129,"ou":130,"aːĭ":131,"ɵy":132,"ɔːĭ":133,"ɐu":134,"iːŭ":135,"aːŭ":136,"œː":137,"uːĭ":138,"kː":139,"æi":140,"yi":141,"ɵː":142,"tː":143,"æː":144,"dʑ":145,"l̩":146,"ɒ":147,"ʌː":148,"ɜ":149,"ʔ":150,"s̺":151,"ts̻":152,"s̻":153,"c":154,"ts̺":155,"tsː":156,"ɟː":157,"t̠ʃː":158,"ɡː":159,"ɑː":160,"dː":161,"cː":162,"bː":163,"ɫ":164,"ʊː":165,"q":166,"øy":167,"χ":168}

+ {"UNK":0,"PAD":1,"BOS":2,"EOS":3,"WORD_BOUNDARY":4,"UTT_BOUNDARY":5,"s":6,"iː":7,"ð":8,"ɛ":9,"ɹ":10,"z":11,"ʌ":12,"f":13,"eɪ":14,"w":15,"ɪ":16,"ɡ":17,"l":18,"æ":19,"ɑ":20,"h":21,"ə":22,"ʊ":23,"k":24,"p":25,"uː":26,"b":27,"i":28,"t":29,"aɪ":30,"θ":31,"ŋ":32,"j":33,"ɔ":34,"m":35,"ɔɪ":36,"n":37,"d":38,"oʊ":39,"aʊ":40,"v":41,"ɜː":42,"t̠ʃ":43,"d̠ʒ":44,"ʃ":45,"iə":46,"ʒ":47,"ɑ̃":48,"r":49,"x":50,"nʲ":51,"ɒ":52,"a":53,"ɑː":54,"ɔː":55,"əʊ":56,"ɐ":57,"eə":58,"ʊə":59,"n̩":60,"aː":61,"y":62,"ɛ̃":63,"ʁ":64,"e":65,"ɔ̃":66,"u":67,"o":68,"œ̃":69,"ø":70,"œ":71,"oː":72,"yː":73,"ɲ":74,"ts":75,"eː":76,"ʀ":77,"ç":78,"ɛɪ":79,"ʏ":80,"ɛː":81,"pf":82,"øː":83,"ã":84,"ɾ":85,"β":86,"ʎ":87,"ɣ":88,"ʝ":89,"oɪ":90,"eʊ":91,"pː":92,"ɟ":93,"ʋ":94,"ɪː":95,"ɵ":96,"œy":97,"tʲ":98,"au̯":99,"ʂ":100,"ɤ":101,"kʰ":102,"ʈʂʰ":103,"ɕ":104,"pʰ":105,"tɕ":106,"ʈʂ":107,"ɹ̩":108,"tɕʰ":109,"tʰ":110,"ɻ":111,"ɥ":112,"tsʰ":113,"ei̯":114,"ou̯":115,"ɻ̩":116,"ai̯":117,"ɯː":118,"ɯ":119,"pʲ":120,"ɸ":121,"rʲ":122,"kʲ":123,"bʲ":124,"mʲ":125,"kː":126,"æi":127,"yi":128,"ɵː":129,"tː":130,"æː":131,"dʑ":132,"l̩":133,"œː":134,"ʌː":135,"ɜ":136,"ʔ":137,"s̺":138,"ts̻":139,"s̻":140,"c":141,"ts̺":142,"tsː":143,"ɟː":144,"t̠ʃː":145,"ɡː":146,"dː":147,"cː":148,"bː":149,"dzː":150,"ɫ":151,"ʊː":152,"q":153,"øy":154,"χ":155,"i̯":156,"t̪":157,"d̪":158,"lʲ":159,"ɡʲ":160,"hʲ":161,"dʲ":162,"çʲ":163,"uə":164,"ŭ":165,"fʲ":166,"aɨ":167,"ɨ":168,"uɨ":169,"əɪ":170,"ɨː":171,"ɬ":172,"əɨ":173,"ɪu":174,"ʉ":175,"ʉː":176,"ɑɪ":177,"ʑ":178,"dz":179,"d̠ʒː":180,"ɐ̃":181,"ɛʊ":182,"ũ":183,"iʊ":184,"õ":185,"uɪ":186,"sʲ":187,"t̠ʃʲ":188,"ɔa":189,"ea":190,"iɪ":191,"tsʲ":192,"eo":193,"d̠ʒʲ":194,"ɾʲ":195}