Upload 13 files
Browse files
- 1_Pooling/config.json +10 -0
- README.md +213 -0
- config.json +32 -0
- config_sentence_transformers.json +10 -0
- config_setfit.json +4 -0
- model.safetensors +3 -0
- model_head.pkl +3 -0
- modules.json +20 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +57 -0
- vocab.txt +0 -0
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"word_embedding_dimension": 768,
|
3 |
+
"pooling_mode_cls_token": true,
|
4 |
+
"pooling_mode_mean_tokens": false,
|
5 |
+
"pooling_mode_max_tokens": false,
|
6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
7 |
+
"pooling_mode_weightedmean_tokens": false,
|
8 |
+
"pooling_mode_lasttoken": false,
|
9 |
+
"include_prompt": true
|
10 |
+
}
|
README.md
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
library_name: setfit
|
3 |
+
tags:
|
4 |
+
- setfit
|
5 |
+
- sentence-transformers
|
6 |
+
- text-classification
|
7 |
+
- generated_from_setfit_trainer
|
8 |
+
base_model: avsolatorio/GIST-Embedding-v0
|
9 |
+
metrics:
|
10 |
+
- accuracy
|
11 |
+
widget:
|
12 |
+
- text: 'CON - Conversion - Socure Integration and Optimization: The project aims
|
13 |
+
to integrate and optimize the Socure identity verification system into the existing
|
14 |
+
platform. This includes updating the user interface, improving the mobile verification
|
15 |
+
flow, and addressing issues related to the transition from the Berbix system to
|
16 |
+
Socure. The project also involves enhancing the admin experience, refining the
|
17 |
+
verification status polling route, and ensuring the system handles errors and
|
18 |
+
failures effectively.'
|
19 |
+
- text: 'Webhook Optimization and Error Handling Improvement: This project focuses
|
20 |
+
on enhancing the reliability and efficiency of webhook integrations within a software
|
21 |
+
platform. The main objectives include preventing webhooks from retrying upon failure,
|
22 |
+
addressing specific webhook issues, improving error message clarity, and reducing
|
23 |
+
error-related noise. Additionally, the project aims to ensure critical errors
|
24 |
+
are monitored effectively, and error displays in sync operations are corrected.
|
25 |
+
This initiative will lead to a more stable and user-friendly integration experience,
|
26 |
+
minimizing disruptions and improving overall system performance.'
|
27 |
+
- text: "Groups and Home: Build group feature with group chat, and important chat\
|
28 |
+
\ features (reactions, reply, repost). \nAdded calendaring and polling capability\
|
29 |
+
\ (related to scheduling). \nAbility to create a group, invite users to a group,\
|
30 |
+
\ manage the group (change settings, remove members, admin functions, invitations\
|
31 |
+
\ process, etc.). \nBuilt split bill, photo/video sharing. \nAlso built DM/Subchat\
|
32 |
+
\ capability (for users to communicate separate from the group). \nBuild engagement\
|
33 |
+
\ features such as typing indicators, \"who read message\" read status, delivered\
|
34 |
+
\ status and more. "
|
35 |
+
- text: "CX - Customer Experience - Enhanced Security and User Experience: This project\
|
36 |
+
\ focuses on enhancing the reliability and quality of the user experience and\
|
37 |
+
\ security across the platform. This involved updating the 2FA system to increase\
|
38 |
+
\ security for users with new reset request features and enhance the usability\
|
39 |
+
\ of them. Further work improved the KYC process which is central to the customer\
|
40 |
+
\ use of the product by streamlining KYC reminder and review queues and ensuring\
|
41 |
+
\ customer information is compliant and up-to-date with changing regulations.\
|
42 |
+
\ \n\nOther aspects focused on developing functionality for managing documents\
|
43 |
+
\ securely. The system will also provide secure document transfer and a dashboard\
|
44 |
+
\ for both users and admins. It ensures appropriate audit logging and includes\
|
45 |
+
\ the development of routes, handlers, and data models for efficient workflow.\
|
46 |
+
\ Finally the project improved search and feedback functionality in the help center\
|
47 |
+
\ to improve reliability for users."
|
48 |
+
- text: 'AI Content Generation Engine: Company needed to invent a programmatically
|
49 |
+
usable, reliable way to generate learning content for professionals. No existing
|
50 |
+
solutions satisfy the requirements of being able to ask a question, and repeatedly
|
51 |
+
produce reliable content tailored towards a target audience.'
|
52 |
+
pipeline_tag: text-classification
|
53 |
+
inference: true
|
54 |
+
---
|
55 |
+
|
56 |
+
# SetFit with avsolatorio/GIST-Embedding-v0
|
57 |
+
|
58 |
+
This is a [SetFit](https://github.com/huggingface/setfit) model that can be used for Text Classification. This SetFit model uses [avsolatorio/GIST-Embedding-v0](https://huggingface.co/avsolatorio/GIST-Embedding-v0) as the Sentence Transformer embedding model. A [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) instance is used for classification.
|
59 |
+
|
60 |
+
The model has been trained using an efficient few-shot learning technique that involves:
|
61 |
+
|
62 |
+
1. Fine-tuning a [Sentence Transformer](https://www.sbert.net) with contrastive learning.
|
63 |
+
2. Training a classification head with features from the fine-tuned Sentence Transformer.
|
64 |
+
|
65 |
+
## Model Details
|
66 |
+
|
67 |
+
### Model Description
|
68 |
+
- **Model Type:** SetFit
|
69 |
+
- **Sentence Transformer body:** [avsolatorio/GIST-Embedding-v0](https://huggingface.co/avsolatorio/GIST-Embedding-v0)
|
70 |
+
- **Classification head:** a [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) instance
|
71 |
+
- **Maximum Sequence Length:** 512 tokens
|
72 |
+
- **Number of Classes:** 2 classes
|
73 |
+
<!-- - **Training Dataset:** [Unknown](https://huggingface.co/datasets/unknown) -->
|
74 |
+
<!-- - **Language:** Unknown -->
|
75 |
+
<!-- - **License:** Unknown -->
|
76 |
+
|
77 |
+
### Model Sources
|
78 |
+
|
79 |
+
- **Repository:** [SetFit on GitHub](https://github.com/huggingface/setfit)
|
80 |
+
- **Paper:** [Efficient Few-Shot Learning Without Prompts](https://arxiv.org/abs/2209.11055)
|
81 |
+
- **Blogpost:** [SetFit: Efficient Few-Shot Learning Without Prompts](https://huggingface.co/blog/setfit)
|
82 |
+
|
83 |
+
### Model Labels
|
84 |
+
| Label | Examples |
|
85 |
+
|:------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
86 |
+
| 1 | <ul><li>'Tax System Revamp: Enhancing the tax filing experience by overhauling the system with new functionalities, including state-specific updates, direct deposit, and advanced error detection. Integrates cutting-edge features like data prefill, OCR for form uploads, and improved PDF management for a seamless, secure, and efficient filing process across devices.'</li><li>'CHREC - Choice Reconciliation - Financial Reconciliation System: This project involves the development of a system to reconcile financial transactions. The system will handle various types of transactions such as wires, checks, and ACH transfers. It will identify and resolve discrepancies in transaction amounts, ensuring the accuracy and integrity of financial data.'</li><li>'DPROIC Electronics: Creation of a composite focal plane array (CFPA) using multiple Digital Pixel Readout Integrated Circuits (DPROICs), capable of yielding the performance of a very large imaging chip but comprised of multiple readily available smaller chips. This CFPA design is an innovative custom product able to be substituted for large format ROICs using existing hardware.'</li></ul> |
|
87 |
+
| 0 | <ul><li>"Infrastructure and Data Environment Enhancement: This project focuses on enhancing the company's data processing and storage infrastructure across various environments (development, QA, stage, production). It involves creating and managing resources like Amazon ECR repositories, Kubernetes namespaces, and AWS S3 buckets to support various services such as Cerebrum, Reconciler, and Graphcast. Additionally, it includes setting up OpenSearch clusters for improved search capabilities, configuring access permissions, and ensuring seamless deployment and integration of services like neo4j, PostgreSQL databases, and FastAPI applications. The goal is to optimize data management, search functionality, and application performance, facilitating better risk analysis and compliance monitoring."</li><li>'eBay Seller Refurbishment Receive and Grade: A program that secret shops sellers that want to be part of the eBay Seller Refurbished program. Additionally items are re-listed on the eBay platform to be sold.'</li><li>'Quality Automation Resources MVP: This project aims to significantly improve the quality and efficiency of software testing processes. By expanding test scenario capabilities using advanced AI, enhancing educational content for automation, conducting performance/load and API testing, improving documentation, and implementing data collection with dashboards, the project seeks to provide a comprehensive upgrade to the current testing framework. This will enable more robust, efficient, and insightful testing practices, ensuring higher software quality for users.'</li></ul> |
|
88 |
+
|
89 |
+
## Uses
|
90 |
+
|
91 |
+
### Direct Use for Inference
|
92 |
+
|
93 |
+
First install the SetFit library:
|
94 |
+
|
95 |
+
```bash
|
96 |
+
pip install setfit
|
97 |
+
```
|
98 |
+
|
99 |
+
Then you can load this model and run inference.
|
100 |
+
|
101 |
+
```python
|
102 |
+
from setfit import SetFitModel
|
103 |
+
|
104 |
+
# Download from the 🤗 Hub
|
105 |
+
model = SetFitModel.from_pretrained("setfit_model_id")
|
106 |
+
# Run inference
|
107 |
+
preds = model("AI Content Generation Engine: Company needed to invent a programmatically usable, reliable way to generate learning content for professionals. No existing solutions satisfy the requirements of being able to ask a question, and repeatedly produce reliable content tailored towards a target audience.")
|
108 |
+
```
|
109 |
+
|
110 |
+
<!--
|
111 |
+
### Downstream Use
|
112 |
+
|
113 |
+
*List how someone could finetune this model on their own dataset.*
|
114 |
+
-->
|
115 |
+
|
116 |
+
<!--
|
117 |
+
### Out-of-Scope Use
|
118 |
+
|
119 |
+
*List how the model may foreseeably be misused and address what users ought not to do with the model.*
|
120 |
+
-->
|
121 |
+
|
122 |
+
<!--
|
123 |
+
## Bias, Risks and Limitations
|
124 |
+
|
125 |
+
*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
|
126 |
+
-->
|
127 |
+
|
128 |
+
<!--
|
129 |
+
### Recommendations
|
130 |
+
|
131 |
+
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
|
132 |
+
-->
|
133 |
+
|
134 |
+
## Training Details
|
135 |
+
|
136 |
+
### Training Set Metrics
|
137 |
+
| Training set | Min | Mean    | Max |
|
138 |
+
|:-------------|:----|:--------|:----|
|
139 |
+
| Word count | 18 | 75.5789 | 397 |
|
140 |
+
|
141 |
+
| Label | Training Sample Count |
|
142 |
+
|:------|:----------------------|
|
143 |
+
| 0 | 146 |
|
144 |
+
| 1 | 82 |
|
145 |
+
|
146 |
+
### Training Hyperparameters
|
147 |
+
- batch_size: (8, 8)
|
148 |
+
- num_epochs: (3, 3)
|
149 |
+
- max_steps: -1
|
150 |
+
- sampling_strategy: oversampling
|
151 |
+
- num_iterations: 20
|
152 |
+
- body_learning_rate: (2e-05, 1e-05)
|
153 |
+
- head_learning_rate: 0.01
|
154 |
+
- loss: CosineSimilarityLoss
|
155 |
+
- distance_metric: cosine_distance
|
156 |
+
- margin: 0.25
|
157 |
+
- end_to_end: False
|
158 |
+
- use_amp: True
|
159 |
+
- warmup_proportion: 0.1
|
160 |
+
- seed: 42
|
161 |
+
- eval_max_steps: -1
|
162 |
+
- load_best_model_at_end: False
|
163 |
+
|
164 |
+
### Training Results
|
165 |
+
| Epoch | Step | Training Loss | Validation Loss |
|
166 |
+
|:------:|:----:|:-------------:|:---------------:|
|
167 |
+
| 0.0009 | 1 | 0.2391 | - |
|
168 |
+
| 0.8772 | 1000 | 0.0011 | - |
|
169 |
+
| 1.7544 | 2000 | 0.0009 | - |
|
170 |
+
| 2.6316 | 3000 | 0.0008 | - |
|
171 |
+
|
172 |
+
### Framework Versions
|
173 |
+
- Python: 3.9.16
|
174 |
+
- SetFit: 1.0.3
|
175 |
+
- Sentence Transformers: 3.1.1
|
176 |
+
- Transformers: 4.39.0
|
177 |
+
- PyTorch: 2.4.1+cu121
|
178 |
+
- Datasets: 3.0.0
|
179 |
+
- Tokenizers: 0.15.2
|
180 |
+
|
181 |
+
## Citation
|
182 |
+
|
183 |
+
### BibTeX
|
184 |
+
```bibtex
|
185 |
+
@article{https://doi.org/10.48550/arxiv.2209.11055,
|
186 |
+
doi = {10.48550/ARXIV.2209.11055},
|
187 |
+
url = {https://arxiv.org/abs/2209.11055},
|
188 |
+
author = {Tunstall, Lewis and Reimers, Nils and Jo, Unso Eun Seo and Bates, Luke and Korat, Daniel and Wasserblat, Moshe and Pereg, Oren},
|
189 |
+
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences},
|
190 |
+
title = {Efficient Few-Shot Learning Without Prompts},
|
191 |
+
publisher = {arXiv},
|
192 |
+
year = {2022},
|
193 |
+
copyright = {Creative Commons Attribution 4.0 International}
|
194 |
+
}
|
195 |
+
```
|
196 |
+
|
197 |
+
<!--
|
198 |
+
## Glossary
|
199 |
+
|
200 |
+
*Clearly define terms in order to be accessible across audiences.*
|
201 |
+
-->
|
202 |
+
|
203 |
+
<!--
|
204 |
+
## Model Card Authors
|
205 |
+
|
206 |
+
*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
|
207 |
+
-->
|
208 |
+
|
209 |
+
<!--
|
210 |
+
## Model Card Contact
|
211 |
+
|
212 |
+
*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
|
213 |
+
-->
|
config.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "avsolatorio/GIST-Embedding-v0",
|
3 |
+
"architectures": [
|
4 |
+
"BertModel"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"id2label": {
|
13 |
+
"0": "LABEL_0"
|
14 |
+
},
|
15 |
+
"initializer_range": 0.02,
|
16 |
+
"intermediate_size": 3072,
|
17 |
+
"label2id": {
|
18 |
+
"LABEL_0": 0
|
19 |
+
},
|
20 |
+
"layer_norm_eps": 1e-12,
|
21 |
+
"max_position_embeddings": 512,
|
22 |
+
"model_type": "bert",
|
23 |
+
"num_attention_heads": 12,
|
24 |
+
"num_hidden_layers": 12,
|
25 |
+
"pad_token_id": 0,
|
26 |
+
"position_embedding_type": "absolute",
|
27 |
+
"torch_dtype": "float32",
|
28 |
+
"transformers_version": "4.39.0",
|
29 |
+
"type_vocab_size": 2,
|
30 |
+
"use_cache": true,
|
31 |
+
"vocab_size": 30522
|
32 |
+
}
|
config_sentence_transformers.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"__version__": {
|
3 |
+
"sentence_transformers": "3.1.1",
|
4 |
+
"transformers": "4.39.0",
|
5 |
+
"pytorch": "2.4.1+cu121"
|
6 |
+
},
|
7 |
+
"prompts": {},
|
8 |
+
"default_prompt_name": null,
|
9 |
+
"similarity_fn_name": null
|
10 |
+
}
|
config_setfit.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"normalize_embeddings": false,
|
3 |
+
"labels": null
|
4 |
+
}
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7a91757960e509d70b9b25139b498ccff152ffa3cb023488b01798d5e79f37c5
|
3 |
+
size 437951328
|
model_head.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1bee06e700569b723905a5a260e9a61f8b060cff290e90c383325bb7296d74a1
|
3 |
+
size 7007
|
modules.json
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"idx": 0,
|
4 |
+
"name": "0",
|
5 |
+
"path": "",
|
6 |
+
"type": "sentence_transformers.models.Transformer"
|
7 |
+
},
|
8 |
+
{
|
9 |
+
"idx": 1,
|
10 |
+
"name": "1",
|
11 |
+
"path": "1_Pooling",
|
12 |
+
"type": "sentence_transformers.models.Pooling"
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"idx": 2,
|
16 |
+
"name": "2",
|
17 |
+
"path": "2_Normalize",
|
18 |
+
"type": "sentence_transformers.models.Normalize"
|
19 |
+
}
|
20 |
+
]
|
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_seq_length": 512,
|
3 |
+
"do_lower_case": true
|
4 |
+
}
|
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": {
|
3 |
+
"content": "[CLS]",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"mask_token": {
|
10 |
+
"content": "[MASK]",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "[PAD]",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"sep_token": {
|
24 |
+
"content": "[SEP]",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"unk_token": {
|
31 |
+
"content": "[UNK]",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": false,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
}
|
37 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "[PAD]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"100": {
|
12 |
+
"content": "[UNK]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"101": {
|
20 |
+
"content": "[CLS]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"102": {
|
28 |
+
"content": "[SEP]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"103": {
|
36 |
+
"content": "[MASK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"clean_up_tokenization_spaces": true,
|
45 |
+
"cls_token": "[CLS]",
|
46 |
+
"do_basic_tokenize": true,
|
47 |
+
"do_lower_case": true,
|
48 |
+
"mask_token": "[MASK]",
|
49 |
+
"model_max_length": 512,
|
50 |
+
"never_split": null,
|
51 |
+
"pad_token": "[PAD]",
|
52 |
+
"sep_token": "[SEP]",
|
53 |
+
"strip_accents": null,
|
54 |
+
"tokenize_chinese_chars": true,
|
55 |
+
"tokenizer_class": "BertTokenizer",
|
56 |
+
"unk_token": "[UNK]"
|
57 |
+
}
|
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|