from datasets import load_dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer, TrainingArguments, Trainer
import torch
# Load Dataset
dataset_path = "train-lf-final.jsonl" # Ensure this file is uploaded
dataset = load_dataset("json", data_files=dataset_path)
# Split dataset into training and validation sets
dataset = dataset["train"].train_test_split(test_size=0.1)
# Define label mapping
label_list = ["O", "B-BRAND", "I-BRAND", "B-CATEGORY", "I-CATEGORY", "B-GENDER", "B-PRICE", "I-PRICE"]
label_map = {label: i for i, label in enumerate(label_list)}
# Load Tokenizer
model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Tokenization function: aligns word-level tags with sub-word tokens.
# Written for batched mapping (see the batched=True call to dataset.map below).
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    all_labels = []
    for i, tags in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        labels = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens ([CLS], [SEP], padding) are ignored by the loss
                labels.append(-100)
            else:
                # Every sub-word token inherits the tag of its source word
                labels.append(label_map[tags[word_idx]])
        all_labels.append(labels)
    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs
# Apply tokenization
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
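# Optional sketch (assumption, not part of the original script): entity-level metrics with
# the seqeval library (pip install seqeval). To enable it, pass compute_metrics=compute_metrics
# when constructing the Trainer further down.
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    # Keep only positions with a real tag (drop the -100 special-token positions)
    true_tags = [
        [label_list[l] for l in label_row if l != -100]
        for label_row in labels
    ]
    pred_tags = [
        [label_list[p] for p, l in zip(pred_row, label_row) if l != -100]
        for pred_row, label_row in zip(predictions, labels)
    ]
    return {
        "precision": precision_score(true_tags, pred_tags),
        "recall": recall_score(true_tags, pred_tags),
        "f1": f1_score(true_tags, pred_tags),
    }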
# Load Model
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))
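# Optional addition (assumption, not in the original script): attach readable label names to the
# model config so the pushed checkpoint returns tag strings instead of LABEL_0 ... LABEL_7.
model.config.id2label = {i: label for i, label in enumerate(label_list)}
model.config.label2id = label_map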
# Training Arguments
training_args = TrainingArguments(
    output_dir="./ner_model",
    evaluation_strategy="epoch",  # recent transformers releases rename this to eval_strategy
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    logging_dir="./logs",
)
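# Optional sketch (assumption, not part of the original script): a collator that pads each batch
# dynamically. With it, the fixed padding="max_length" in the tokenization step could be dropped.
# Pass data_collator=data_collator to the Trainer below to use it.
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)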
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)
# Train the model
trainer.train()
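# Optional (assumption, not in the original script): report validation metrics after training.
eval_metrics = trainer.evaluate()
print(eval_metrics)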
# Push to Hugging Face Hub
model.push_to_hub("your-hf-username/distilbert-ner")
tokenizer.push_to_hub("your-hf-username/distilbert-ner")
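# Usage sketch (assumption, not part of the original script): load the pushed checkpoint for
# inference. "your-hf-username/distilbert-ner" is the placeholder repo id used above, and the
# example query is purely illustrative.
from transformers import pipeline

ner = pipeline("token-classification", model="your-hf-username/distilbert-ner", aggregation_strategy="simple")
print(ner("nike running shoes for men under 100 dollars"))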