DINGOLANI committed
Commit d4cc803 · verified · 1 Parent(s): fa0a47b

Create train.py

Files changed (1):
1. train.py +72 -0
train.py ADDED
from datasets import load_dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer, TrainingArguments, Trainer

# Load the dataset (one JSON object per line)
dataset_path = "train-lf-final.jsonl"  # Ensure this file is uploaded
dataset = load_dataset("json", data_files=dataset_path)
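
# Each line of train-lf-final.jsonl is assumed to be a JSON object with parallel
# "tokens" and "tags" lists, which is what tokenize_and_align_labels below reads.
# Hypothetical record: {"tokens": ["nike", "shoes"], "tags": ["B-BRAND", "B-CATEGORY"]}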

# Split the dataset into training and validation sets
dataset = dataset["train"].train_test_split(test_size=0.1)

# Define the label mapping (BIO scheme)
label_list = ["O", "B-BRAND", "I-BRAND", "B-CATEGORY", "I-CATEGORY", "B-GENDER", "B-PRICE", "I-PRICE"]
label_map = {label: i for i, label in enumerate(label_list)}

# Load the tokenizer
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the pre-split words and align the word-level tags with sub-token
# positions. Written for batched mapping: examples["tokens"] is a list of
# token lists, so word_ids(batch_index=i) must be queried per example.
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    all_labels = []
    for i, tags in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        prev_word_idx = None
        labels = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens and padding are ignored by the loss
                labels.append(-100)
            elif word_idx != prev_word_idx:
                # The first sub-token of a word carries the word's label
                labels.append(label_map[tags[word_idx]])
            else:
                # Mask the remaining sub-tokens so each word is scored once
                labels.append(-100)
            prev_word_idx = word_idx
        all_labels.append(labels)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Load the model; id2label/label2id let the pushed checkpoint decode predictions
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label={i: label for i, label in enumerate(label_list)},
    label2id=label_map,
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./ner_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    logging_dir="./logs",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)
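
# Note: no compute_metrics callback is passed above, so per-epoch evaluation
# reports only the token-level loss; an entity-level precision/recall/F1
# callback (e.g. built on seqeval) could be added here if needed.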

# Train the model
trainer.train()

# Push to Hugging Face Hub
model.push_to_hub("your-hf-username/distilbert-ner")
tokenizer.push_to_hub("your-hf-username/distilbert-ner")
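
Once the push succeeds, the checkpoint can be sanity-checked with the standard transformers pipeline. A minimal sketch, assuming the repo id used above ("your-hf-username/distilbert-ner" is a placeholder) and that the id2label mapping was saved with the model; the query string is an invented example:

from transformers import pipeline

# Load the pushed checkpoint; aggregation_strategy="simple" merges sub-tokens
# back into whole-word entity spans.
ner = pipeline("token-classification", model="your-hf-username/distilbert-ner", aggregation_strategy="simple")
print(ner("nike running shoes for men under 100"))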