# QueryAnalyzerV2 / train.py
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
)
from huggingface_hub import login

# Log in to the Hugging Face Hub. notebook_login() only works inside Jupyter;
# login() prompts for a token in plain scripts (or reads HF_TOKEN if set).
login()
# Step 1: Load the luxury fashion dataset (replace with the actual dataset file)
df = pd.read_csv("luxury_apparel_data.csv")

# Keep only the relevant columns and drop incomplete rows
df = df[['brand', 'category', 'description', 'price']].dropna()

# Generate synthetic search queries following a fixed template:
# "<brand> <category> under <price> AED"
df['query'] = df.apply(lambda x: f"{x['brand']} {x['category']} under {x['price']} AED", axis=1)
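# Optional sanity check (not in the original script): preview a few generated
# queries to confirm the template renders as expected.
print(df['query'].head())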
# Step 2: Tokenization and label alignment
# Token classification needs a per-token `labels` column; without it the
# Trainer cannot compute a loss. Because every query follows the fixed
# template "<brand> <category> under <price> AED", word-level BIO tags can
# be derived directly from the source columns.
label_list = ["O", "B-BRAND", "I-BRAND", "B-CATEGORY", "I-CATEGORY", "B-PRICE", "I-PRICE"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def word_tags(row):
    """BIO-tag each whitespace-separated word of the templated query."""
    tags = []
    for entity, text in (("BRAND", row["brand"]), ("CATEGORY", row["category"])):
        n_words = len(str(text).split())
        tags += [f"B-{entity}"] + [f"I-{entity}"] * (n_words - 1)
    tags += ["O", "B-PRICE", "O"]  # "under", the price value, "AED"
    return tags

df["word_tags"] = df.apply(word_tags, axis=1)

def tokenize_batch(batch):
    tokenized = tokenizer(
        [query.split() for query in batch["query"]],
        is_split_into_words=True,
        padding=True,
        truncation=True,
    )
    # Align word-level tags with subword tokens: label the first subword of
    # each word; mask special tokens and remaining subwords with -100.
    all_labels = []
    for i, tags in enumerate(batch["word_tags"]):
        previous = None
        labels = []
        for word_id in tokenized.word_ids(batch_index=i):
            if word_id is None or word_id == previous:
                labels.append(-100)
            else:
                labels.append(label2id[tags[word_id]])
            previous = word_id
        all_labels.append(labels)
    tokenized["labels"] = all_labels
    return tokenized

# Convert the dataframe into a Hugging Face dataset
hf_dataset = Dataset.from_pandas(df[["query", "word_tags"]].reset_index(drop=True))
hf_dataset = hf_dataset.map(tokenize_batch, batched=True, remove_columns=["query", "word_tags"])
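# Optional check (not in the original script): decode the first example's
# tokens next to their aligned tags to verify the BIO alignment.
sample = hf_dataset[0]
tokens = tokenizer.convert_ids_to_tokens(sample["input_ids"])
tags = [label_list[l] if l != -100 else "-" for l in sample["labels"]]
print(list(zip(tokens, tags)))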
# Step 3: Fine-tune the pretrained NER model
# The dslim/bert-base-NER checkpoint ships with a different label set
# (PER/ORG/LOC/MISC), so the classification head must be resized to ours.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
training_args = TrainingArguments(
    output_dir="./luxury_ner_model",
    evaluation_strategy="epoch",  # renamed to eval_strategy in recent transformers releases
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=500,
)
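# Optional (a sketch, not in the original script): entity-level F1 during
# evaluation. Requires the extra `evaluate` and `seqeval` packages; wire it in
# by passing compute_metrics=compute_metrics to the Trainer below.
import numpy as np
import evaluate

seqeval = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Drop masked (-100) positions and map ids back to tag strings
    true_tags = [[label_list[l] for l in row if l != -100] for row in labels]
    pred_tags = [
        [label_list[p] for p, l in zip(pred_row, label_row) if l != -100]
        for pred_row, label_row in zip(preds, labels)
    ]
    return seqeval.compute(predictions=pred_tags, references=true_tags)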
# Hold out a slice for evaluation instead of evaluating on the training data
splits = hf_dataset.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
)
trainer.train()
# Push the fine-tuned model and tokenizer to the Hugging Face Hub
model.push_to_hub("luxury-fashion-ner")
tokenizer.push_to_hub("luxury-fashion-ner")
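# Usage sketch (not in the original script): load the pushed model with a
# token-classification pipeline and tag a sample query. Adjust the repo id to
# "<username>/luxury-fashion-ner" when loading from another machine.
from transformers import pipeline

ner = pipeline(
    "token-classification",
    model="luxury-fashion-ner",
    tokenizer="luxury-fashion-ner",
    aggregation_strategy="simple",
)
print(ner("Gucci handbag under 5000 AED"))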