First version of TeLVE!
# The TeLVE v1.0
![TeLVE v1.png](https://cdn-uploads.huggingface.co/production/uploads/63417787a7582111c3f50df8/XrMTQ_yPOlqQJkwGCt58D.png)
- .gitattributes +1 -0
- README.md +69 -3
- images/mugla.jpg +3 -0
- imagine.py +103 -0
- main.py +167 -0
- models/TeLVE_v1.0.pth +3 -0
- teLVE_logo.png +0 -0
- tokenizer/special_tokens_map.json +7 -0
- tokenizer/tokenizer.json +0 -0
- tokenizer/tokenizer_config.json +58 -0
- tokenizer/vocab.txt +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+images/mugla.jpg filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,3 +1,69 @@
# TeLVE: Turkish efficient Language Vision Engine 🧿
[![License: CC BY 4.0](https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by/4.0/)
[![Models: v1.0](https://img.shields.io/badge/Models-v1.0-blue)](https://huggingface.co/outsu/TeLVE)
## First Turkish VLM ever!

TeLVE is the first Visual Language Model designed specifically for Turkish language understanding and image description generation. Built on Vision Transformer (ViT) and BERT pre-trained encoder architectures, it bridges the gap in Turkish visual-linguistic processing.

![TeLVE logo](teLVE_logo.png)

## Model Description

TeLVE combines (a minimal wiring sketch follows the list):
- 🖼️ Vision Transformer (ViT-base-patch16-224)
- 📝 Turkish BERT (dbmdz/bert-base-turkish-cased)
- 🔄 Cross-attention mechanism for vision-language fusion
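For a concrete picture of how these pieces connect, here is a minimal sketch of the wiring that `main.py` and `imagine.py` build (same model names and configuration flags; training and generation logic omitted):

```python
import torch.nn as nn
from transformers import ViTModel, BertConfig, BertLMHeadModel

# Vision encoder: ViT turns an image into a sequence of patch embeddings.
vit = ViTModel.from_pretrained("google/vit-base-patch16-224")

# Text decoder: Turkish BERT configured as a decoder with cross-attention,
# so each generated token can attend to the image features.
config = BertConfig.from_pretrained("dbmdz/bert-base-turkish-cased")
config.is_decoder = True
config.add_cross_attention = True
bert = BertLMHeadModel.from_pretrained("dbmdz/bert-base-turkish-cased", config=config)

# Projection that maps ViT's hidden size onto BERT's hidden size before fusion.
proj = nn.Linear(vit.config.hidden_size, bert.config.hidden_size)
```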

### Version Logs
- **TeLVE v1.0**: Trained on the Unsplash Lite dataset

## Usage

The model can be used in two ways:

### Inference (imagine.py)
```bash
# Generate captions for images
python imagine.py
```
This script:
- Loads a trained TeLVE model
- Takes images from the `images` directory
- Generates a Turkish caption for each image
- Outputs the results to the console (a programmatic usage sketch follows the list)

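Beyond the command-line entry point, the helpers defined in `imagine.py` can be reused directly. A minimal sketch using the default paths shipped in this repository:

```python
from transformers import BertTokenizerFast
from imagine import load_model, generate_caption

# Default locations in this repository; adjust if you keep the files elsewhere.
model = load_model("./models/TeLVE_v1.0.pth")
tokenizer = BertTokenizerFast.from_pretrained("./tokenizer")

# Caption a single image (images/mugla.jpg ships with this commit).
caption = generate_caption(model, "./images/mugla.jpg", tokenizer)
print(caption)
```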
### Training (main.py)
Users can train their own models with the ViT and BERT encoders.
```bash
# Train a new model
python main.py
```

This script:
- Loads and preprocesses image-caption pairs (the expected dataset layout is sketched below)
- Initializes the ViT and BERT encoders
- Trains the combined model
- Saves the model and tokenizer

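`main.py` does not download any data; it expects an Unsplash Lite-style TSV file under `./datasets/` and the matching images under `../download/images/`, and it only reads the `photo_id` and `ai_description` columns. A small sanity check of that layout (the file name mirrors the `'./datasets/' + <checkpoint name> + '.tsv000'` pattern hard-coded in the script):

```python
import pandas as pd

# Tab-separated Unsplash Lite export; must contain 'photo_id' and 'ai_description'.
df = pd.read_csv("./datasets/TeLVE_v1.0.pth.tsv000", sep="\t", encoding="utf-8")
print(df[["photo_id", "ai_description"]].head())

# Each photo_id must have a matching image file: ../download/images/<photo_id>.jpg
```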

## Performance
Performance scores have not been evaluated yet.
<!--
| Model Version | Dataset         | BLEU-4 | METEOR | CIDEr |
|---------------|-----------------|--------|--------|-------|
| TeLVE v1.0    | Unsplash        | *TBD*  | *TBD*  | *TBD* |
| TeLVE v1.1    | Unsplash+Pexels | *TBD*  | *TBD*  | *TBD* |-->

## Citation

```bibtex
@software{telve2024,
  author = {Öğüt Su Karagün},
  title  = {TeLVE: Turkish efficient Language Vision Engine},
  year   = {2024},
  url    = {https://huggingface.co/outsu/TeLVE}
}
```

## License
This work is licensed under a [Creative Commons Attribution 4.0 International License](http://creativecommons.org/licenses/by/4.0/).
images/mugla.jpg
ADDED
imagine.py
ADDED
@@ -0,0 +1,103 @@
import torch
import torch.nn as nn
from torchvision import transforms
from transformers import ViTModel, BertTokenizerFast, BertConfig, BertLMHeadModel
from PIL import Image
import os

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define constants
VIT_MODEL_NAME = "google/vit-base-patch16-224"
BERT_MODEL_NAME = "dbmdz/bert-base-turkish-cased"
MAX_LENGTH = 128

class ImageCaptioningModel(nn.Module):
    def __init__(self, vit_model, bert_model):
        super(ImageCaptioningModel, self).__init__()
        self.vit = vit_model
        self.bert = bert_model
        self.linear = nn.Linear(self.vit.config.hidden_size, self.bert.config.hidden_size)

    def forward(self, pixel_values, input_ids, attention_mask, labels=None):
        image_features = self.vit(pixel_values).last_hidden_state
        image_features = self.linear(image_features)

        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            encoder_hidden_states=image_features,
                            labels=labels,
                            return_dict=True)

        return outputs.loss, outputs.logits

def load_model(model_path):
    # Initialize the model components
    vit_model = ViTModel.from_pretrained(VIT_MODEL_NAME)
    bert_config = BertConfig.from_pretrained(BERT_MODEL_NAME)
    bert_config.is_decoder = True
    bert_config.add_cross_attention = True
    bert_model = BertLMHeadModel.from_pretrained(BERT_MODEL_NAME, config=bert_config)

    # Create the combined model and load the trained weights
    model = ImageCaptioningModel(vit_model, bert_model)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    return model

def generate_caption(model, image_path, tokenizer):
    # Prepare the image
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0).to(device)

    # Generate the caption greedily, one token at a time, starting from [CLS]
    with torch.no_grad():
        input_ids = torch.tensor([[tokenizer.cls_token_id]]).to(device)
        attention_mask = torch.tensor([[1]]).to(device)

        for _ in range(MAX_LENGTH):
            _, logits = model(image, input_ids, attention_mask)
            next_token = logits[:, -1, :].argmax(dim=-1)

            # Stop when the model emits the [SEP] end-of-sequence token
            if next_token.item() == tokenizer.sep_token_id:
                break

            input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)
            attention_mask = torch.cat([attention_mask, torch.tensor([[1]]).to(device)], dim=1)

    caption = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    return caption

def main():
    model_path = "./models/TeLVE_v1.0.pth"  # checkpoint shipped in this commit
    tokenizer_path = "./tokenizer"

    # Check if the model and tokenizer exist
    if not os.path.exists(model_path) or not os.path.exists(tokenizer_path):
        print("Model or tokenizer not found. Please make sure you have trained the model and saved it correctly.")
        return

    # Load the model and tokenizer
    model = load_model(model_path)
    tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)

    # Generate captions for images in a specified directory
    image_dir = "./images"  # Change this to the directory containing your test images
    for image_file in os.listdir(image_dir):
        if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(image_dir, image_file)
            caption = generate_caption(model, image_path, tokenizer)
            print(f"Image: {image_file}")
            print(f"Generated Caption: {caption}")
            print("---")

if __name__ == "__main__":
    main()
main.py
ADDED
@@ -0,0 +1,167 @@
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import ViTModel, BertTokenizerFast, BertConfig, BertLMHeadModel, AdamW
from PIL import Image, ImageFile
import pandas as pd
from tqdm import tqdm

# Increase the maximum image size limit to avoid DecompressionBombWarning
Image.MAX_IMAGE_PIXELS = None
# Allow loading truncated images
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define constants
VIT_MODEL_NAME = "google/vit-base-patch16-224"
BERT_MODEL_NAME = "dbmdz/bert-base-turkish-cased"  # Using a Turkish BERT model
# Checkpoint file name; also used to pick the dataset file. Renamed from `model`
# to avoid shadowing the nn.Module created inside train_vlm_model().
MODEL_NAME = "TeLVE_v1.0.pth"
MAX_LENGTH = 128
BATCH_SIZE = 8
EPOCHS = 5
LEARNING_RATE = 2e-5

class ImageCaptioningDataset(Dataset):
    def __init__(self, dataframe, img_dir, tokenizer):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        img_path = os.path.join(self.img_dir, row['photo_id'] + ".jpg")

        try:
            image = Image.open(img_path).convert('RGB')
            image = self.transform(image)
        except (FileNotFoundError, IOError):
            # Return None if the image is not found or cannot be opened
            return None

        caption = row['ai_description']

        # Check if caption is a valid string
        if not isinstance(caption, str):
            return None  # Skip the example if caption is not valid

        encoding = self.tokenizer(
            caption,
            add_special_tokens=True,
            max_length=MAX_LENGTH,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'pixel_values': image,
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': encoding['input_ids'].squeeze()  # Use input_ids as labels for calculating loss
        }


class ImageCaptioningModel(nn.Module):
    def __init__(self, vit_model, bert_model):
        super(ImageCaptioningModel, self).__init__()
        self.vit = vit_model
        self.bert = bert_model
        self.linear = nn.Linear(self.vit.config.hidden_size, self.bert.config.hidden_size)

    def forward(self, pixel_values, input_ids, attention_mask, labels=None):
        image_features = self.vit(pixel_values).last_hidden_state
        image_features = self.linear(image_features)

        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            encoder_hidden_states=image_features,
                            labels=labels,
                            return_dict=True)

        return outputs.loss, outputs.logits

def collate_fn(batch):
    # Filter out None values (skipped images)
    batch = list(filter(lambda x: x is not None, batch))
    if len(batch) == 0:
        return None
    return {key: torch.stack([item[key] for item in batch]) for key in batch[0]}

def train_vlm_model():
    # Load and preprocess the dataset, trying common Turkish text encodings
    encodings = ['utf-8', 'iso-8859-9', 'windows-1254']
    for encoding in encodings:
        try:
            df = pd.read_csv('./datasets/' + MODEL_NAME + '.tsv000', sep='\t', encoding=encoding)
            print(f"Successfully read the file with {encoding} encoding.")
            break
        except UnicodeDecodeError:
            print(f"Failed to read with {encoding} encoding. Trying next...")
    else:
        raise ValueError("Could not read the file with any of the specified encodings.")

    # Initialize the tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(BERT_MODEL_NAME)

    # Create the dataset and dataloader
    dataset = ImageCaptioningDataset(df, '../download/images', tokenizer)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

    # Initialize the model components
    vit_model = ViTModel.from_pretrained(VIT_MODEL_NAME)
    bert_config = BertConfig.from_pretrained(BERT_MODEL_NAME)
    bert_config.is_decoder = True
    bert_config.add_cross_attention = True
    bert_model = BertLMHeadModel.from_pretrained(BERT_MODEL_NAME, config=bert_config)

    # Create the combined model
    model = ImageCaptioningModel(vit_model, bert_model)
    model.to(device)

    # Define optimizer
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

    # Training loop
    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}")
        for batch in progress_bar:
            if batch is None:
                continue

            pixel_values = batch['pixel_values'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            loss, _ = model(pixel_values, input_ids, attention_mask, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})

        print(f"Epoch {epoch+1}/{EPOCHS}, Average Loss: {total_loss/len(dataloader)}")

    # Save the model and tokenizer
    torch.save(model.state_dict(), "./models/" + MODEL_NAME)
    tokenizer.save_pretrained("./tokenizer")

if __name__ == "__main__":
    train_vlm_model()
models/TeLVE_v1.0.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c79764aa75a603efead82246db2078c4d2c07edbdf218ec8719f7817f5728c68
size 904212666
teLVE_logo.png
ADDED
tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
tokenizer/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,58 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "4": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": false,
  "mask_token": "[MASK]",
  "max_len": 512,
  "model_max_length": 512,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
tokenizer/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff