import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
)
from huggingface_hub import notebook_login

# Login to Hugging Face Hub (set the model repo to private later if needed)
notebook_login()

# Step 1: Load Luxury Fashion Dataset (replace with your actual dataset file)
df = pd.read_csv("luxury_apparel_data.csv")

# Keep only relevant columns
df = df[['brand', 'category', 'description', 'price']].dropna()

# Step 2: Build word-level BIO tags from the query template
# "{brand} {category} under {price} AED", e.g. "Gucci handbag under 5000 AED".
# Token-classification training needs per-token labels; without a "labels"
# column the Trainer cannot compute a loss and training fails.
label_list = ["O", "B-BRAND", "I-BRAND", "B-CATEGORY", "I-CATEGORY", "B-PRICE", "I-PRICE"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

def make_words_and_tags(row):
    words, tags = [], []
    for text, entity in [(str(row['brand']), "BRAND"), (str(row['category']), "CATEGORY")]:
        for i, word in enumerate(text.split()):
            words.append(word)
            tags.append(f"{'B' if i == 0 else 'I'}-{entity}")
    words += ["under", str(row['price']), "AED"]
    tags += ["O", "B-PRICE", "I-PRICE"]
    return pd.Series({"words": words, "tags": tags})

df[["words", "tags"]] = df.apply(make_words_and_tags, axis=1)

# Step 3: Tokenization, aligning the word-level tags to wordpieces
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align(batch):
    tokenized = tokenizer(batch["words"], truncation=True, is_split_into_words=True)
    all_labels = []
    for i, word_tags in enumerate(batch["tags"]):
        word_ids = tokenized.word_ids(batch_index=i)
        previous = None
        label_ids = []
        for word_id in word_ids:
            if word_id is None or word_id == previous:
                label_ids.append(-100)  # ignore special tokens and trailing subwords
            else:
                label_ids.append(label2id[word_tags[word_id]])
            previous = word_id
        all_labels.append(label_ids)
    tokenized["labels"] = all_labels
    return tokenized

# Convert dataframe into a Hugging Face dataset and hold out 10% for evaluation
hf_dataset = Dataset.from_pandas(df[["words", "tags"]], preserve_index=False)
hf_dataset = hf_dataset.map(tokenize_and_align, batched=True, remove_columns=["words", "tags"])
splits = hf_dataset.train_test_split(test_size=0.1, seed=42)

# Step 4: Fine-tune the pretrained NER model with a freshly initialized
# classification head, since our BRAND/CATEGORY/PRICE label set differs
# from the CoNLL-2003 labels the checkpoint was trained on
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

training_args = TrainingArguments(
    output_dir="./luxury_ner_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),  # pads inputs and labels
)

trainer.train()

# Save model and tokenizer to the Hugging Face Hub
model.push_to_hub("luxury-fashion-ner")
tokenizer.push_to_hub("luxury-fashion-ner")
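
# --- Quick inference check (illustrative sketch) ---
# Assumes the push above succeeded; "your-username" is a placeholder for the
# actual Hub namespace the model was pushed to. Uses the standard transformers
# token-classification pipeline to tag a sample query.
from transformers import pipeline

ner = pipeline(
    "token-classification",
    model="your-username/luxury-fashion-ner",
    aggregation_strategy="simple",  # merge wordpieces into whole entity spans
)
print(ner("Gucci handbag under 5000 AED"))
# After training, the spans should come back tagged BRAND / CATEGORY / PRICE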