duoquote committed
Commit 696ac96
Parent: 06410f5

Refactor labels and update model configuration

labels.json CHANGED
@@ -1 +1 @@
- {"1": "B-\u00dclke", "2": "I-\u00dclke", "3": "B-\u0130l", "4": "I-\u0130l", "5": "B-\u0130l\u00e7e", "6": "I-\u0130l\u00e7e", "7": "B-Mahalle", "8": "I-Mahalle", "9": "B-Cadde", "10": "I-Cadde", "11": "B-Sokak", "12": "I-Sokak", "13": "B-Bina Ad\u0131", "14": "I-Bina Ad\u0131", "15": "B-Bina Numaras\u0131", "16": "I-Bina Numaras\u0131", "17": "B-Yer Ad\u0131", "18": "I-Yer Ad\u0131", "19": "B-Site", "20": "I-Site", "21": "B-Adres Detay", "22": "I-Adres Detay", "23": "B-Blok No", "24": "I-Blok No", "25": "B-Bulvar", "26": "I-Bulvar", "27": "B-Daire No", "28": "I-Daire No", "29": "B-Posta Kodu", "30": "I-Posta Kodu", "31": "B-Kat", "32": "I-Kat", "0": "O"}
+ {"1": "\u00dclke", "2": "\u0130l", "3": "\u0130l\u00e7e", "4": "Mahalle", "5": "Cadde", "6": "Sokak", "7": "Bina Ad\u0131", "8": "Bina Numaras\u0131", "9": "Yer Ad\u0131", "10": "Site", "11": "Adres Detay", "12": "Blok No", "13": "Bulvar", "14": "Daire No", "15": "Posta Kodu", "16": "Kat", "0": "[PAD]", "17": "[UNK]"}
model/config.json ADDED
@@ -0,0 +1,65 @@
+ {
+   "_name_or_path": "dbmdz/bert-base-turkish-cased",
+   "architectures": [
+     "BertForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "[PAD]",
+     "1": "\u00dclke",
+     "2": "\u0130l",
+     "3": "\u0130l\u00e7e",
+     "4": "Mahalle",
+     "5": "Cadde",
+     "6": "Sokak",
+     "7": "Bina Ad\u0131",
+     "8": "Bina Numaras\u0131",
+     "9": "Yer Ad\u0131",
+     "10": "Site",
+     "11": "Adres Detay",
+     "12": "Blok No",
+     "13": "Bulvar",
+     "14": "Daire No",
+     "15": "Posta Kodu",
+     "16": "Kat",
+     "17": "[UNK]"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "Adres Detay": 11,
+     "Bina Ad\u0131": 7,
+     "Bina Numaras\u0131": 8,
+     "Blok No": 12,
+     "Bulvar": 13,
+     "Cadde": 5,
+     "Daire No": 14,
+     "Kat": 16,
+     "Mahalle": 4,
+     "Posta Kodu": 15,
+     "Site": 10,
+     "Sokak": 6,
+     "Yer Ad\u0131": 9,
+     "[PAD]": 0,
+     "[UNK]": 17,
+     "\u00dclke": 1,
+     "\u0130l": 2,
+     "\u0130l\u00e7e": 3
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.37.0",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 32000
+ }
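With id2label and label2id baked into config.json, the class names travel with the checkpoint, so nothing has to re-read labels.json at inference time. A small sketch, assuming the model/ directory from this commit is available locally:

```python
from transformers import AutoConfig

# The mapping comes straight from model/config.json added in this commit
config = AutoConfig.from_pretrained("./model")

print(config.id2label[4])        # "Mahalle"
print(config.label2id["Sokak"])  # 6
```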
model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f61796d22b89ac6c4b5bf7cd5932198148f721b23b684a10950709b692328c6
+ size 440185728
model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": false,
+   "mask_token": "[MASK]",
+   "max_len": 512,
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
model/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:300abae98dafa01f4daa08ba322e5f0ec434e9a6823866fb12dde9fb1397ba62
+ size 4664
model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
train.py CHANGED
@@ -63,16 +63,20 @@ def load_data():
      return labels, [orjson.loads(line) for line in data.split("\n") if line]
  
  labels, data = load_data()
- label_to_id = {}
- for i, label in enumerate(labels):
-     label_to_id["B-" + label["text"]] = i * 2 + 1
-     label_to_id["I-" + label["text"]] = i * 2 + 2
- label_to_id["O"] = 0
+ # label_to_id = {}
+ # for i, label in enumerate(labels):
+ #     label_to_id["B-" + label["text"]] = i * 2 + 1
+ #     label_to_id["I-" + label["text"]] = i * 2 + 2
+ # label_to_id["O"] = 0
+ label_to_id = {label["text"]: i + 1 for i, label in enumerate(labels)}
+ label_to_id["[PAD]"] = 0
+ label_to_id["[UNK]"] = len(label_to_id)
  id_to_label = {v: k for k, v in label_to_id.items()}
  
  tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
  model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-base-turkish-cased", num_labels=len(label_to_id)).to(device)
-
+ model.config.id2label = id_to_label
+ model.config.label2id = label_to_id
  
  from datasets import DatasetDict, Dataset
  
@@ -93,20 +97,18 @@ def preprocess_data(item, tokenizer, label_to_id):
      attention_mask = inputs["attention_mask"]
      offset_mapping = inputs["offset_mapping"]
  
-     labels = ["O"] * 128
-     last_label = "O"
+     labels = ["[PAD]"] * 128
      for token_idx, [off_start, off_end] in enumerate(offset_mapping[0]):
          if off_start == off_end:
              continue
  
          for start, end, label in item['label']:
              if start <= off_start and off_end <= end:
-                 if last_label == label:
-                     labels[token_idx] = "I-" + label
-                 else:
-                     labels[token_idx] = "B-" + label
-                 last_label = label
+                 labels[token_idx] = label
                  break
+
+         if labels[token_idx] == "[PAD]":
+             labels[token_idx] = "[UNK]"
  
      # Convert labels to ids
      labels = [label_to_id[label] for label in labels]
@@ -130,7 +132,6 @@ class AddressDataset(Dataset):
          return {key: torch.tensor(val) for key, val in item.items()}
  
  
-
  dataset = Dataset.from_generator(
      lambda: (preprocess_data(item, tokenizer, label_to_id) for item in data),
  )
@@ -165,8 +166,8 @@ def compute_metrics(pred, id_to_label):
      labels = [[id_to_label[label_id] for label_id in label_ids] for label_ids in labels]
      preds = [[id_to_label[pred] for pred in preds] for preds in preds]
  
-     labels = [label for label in labels if label != "O"]
-     preds = [pred for pred in preds if pred != "O"]
+     labels = [set(label) for label in labels]
+     preds = [set(pred) for pred in preds]
  
      mlb = MultiLabelBinarizer()
      mlb.fit([id_to_label.values()])
@@ -193,7 +194,4 @@
  trainer.train()
  trainer.evaluate()
  
- with open("./labels.json", "w") as f:
-     json.dump(id_to_label, f)
-
  trainer.save_model("./model")
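With the refactor in place, predictions can be decoded directly from the saved checkpoint, since the class names live in its config. A rough inference sketch (the ./model path and the 128-token limit match this commit; the address string and the rest of the snippet are illustrative):

```python
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./model")
model = AutoModelForTokenClassification.from_pretrained("./model")
model.eval()

text = "Atatürk Mah. Cumhuriyet Cad. No:12 Kat:3 Kadıköy İstanbul"  # made-up address
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

with torch.no_grad():
    logits = model(**inputs).logits

# Map each token's highest-scoring class id back to its flat label name
pred_ids = logits.argmax(dim=-1)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
for token, pred in zip(tokens, pred_ids):
    print(token, model.config.id2label[pred])
```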