DINGOLANI committed
Commit 736b778 · verified · 1 Parent(s): 85a27c5

Create train.py

Files changed (1): train.py (+64, -0)
train.py ADDED
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
)
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub (use huggingface_hub.login() when running
# outside a notebook; set the target repo to private if needed).
notebook_login()

# Step 1: Load the luxury fashion dataset (replace with the actual dataset file).
df = pd.read_csv("luxury_apparel_data.csv")

# Keep only the relevant columns and drop incomplete rows.
df = df[["brand", "category", "description", "price"]].dropna()

# Generate search queries of the form "<brand> <category> under <price> AED".
df["query"] = df.apply(
    lambda x: f"{x['brand']} {x['category']} under {x['price']} AED", axis=1
)

# Step 2: Tokenization and label alignment.
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Assumed BIO label scheme for the templated queries: the CSV carries no NER
# annotations, and the Trainer cannot compute a token-classification loss
# without a "labels" column.
label_list = ["O", "B-BRAND", "I-BRAND", "B-CATEGORY", "I-CATEGORY", "B-PRICE", "I-PRICE"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

def word_labels(brand, category, query):
    # Word-level tags derived from the fixed query template:
    # brand words, category words, "under", price words, "AED".
    words = query.split()
    tags = ["B-BRAND"] + ["I-BRAND"] * (len(str(brand).split()) - 1)
    tags += ["B-CATEGORY"] + ["I-CATEGORY"] * (len(str(category).split()) - 1)
    tags += ["O"]  # "under"
    price_len = len(words) - len(tags) - 1  # remaining words before "AED"
    tags += ["B-PRICE"] + ["I-PRICE"] * (price_len - 1)
    tags += ["O"]  # "AED"
    return tags

def tokenize_batch(batch):
    tokenized = tokenizer(
        [q.split() for q in batch["query"]],
        is_split_into_words=True,
        truncation=True,
    )
    labels = []
    for i, (brand, category, query) in enumerate(
        zip(batch["brand"], batch["category"], batch["query"])
    ):
        tags = word_labels(brand, category, query)
        ids, prev_word = [], None
        for word_id in tokenized.word_ids(batch_index=i):
            if word_id is None:
                ids.append(-100)  # special tokens ([CLS], [SEP])
            elif word_id != prev_word:
                ids.append(label2id[tags[word_id]])  # first piece of each word
            else:
                ids.append(-100)  # later sub-word pieces are ignored in the loss
            prev_word = word_id
        labels.append(ids)
    tokenized["labels"] = labels
    return tokenized

# Convert the dataframe into a Hugging Face dataset and hold out an eval split.
hf_dataset = Dataset.from_pandas(df[["brand", "category", "query"]])
hf_dataset = hf_dataset.map(
    tokenize_batch, batched=True, remove_columns=hf_dataset.column_names
)
splits = hf_dataset.train_test_split(test_size=0.1)

# Step 3: Fine-tune the pretrained NER model with the new label set.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,  # swap the checkpoint's 9-label head for ours
)

training_args = TrainingArguments(
    output_dir="./luxury_ner_model",
    evaluation_strategy="epoch",  # named "eval_strategy" on transformers >= 4.46
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
)

trainer.train()

# Push the fine-tuned model and tokenizer to the Hugging Face Hub.
model.push_to_hub("luxury-fashion-ner")
tokenizer.push_to_hub("luxury-fashion-ner")
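
A minimal sketch for sanity-checking the pushed model, assuming the push above succeeded; "your-username/luxury-fashion-ner" is a placeholder repo id, and the BRAND/CATEGORY/PRICE entity groups come from the label scheme assumed in the training script:

from transformers import pipeline

# Placeholder repo id; substitute the namespace the model was pushed under.
ner = pipeline(
    "token-classification",
    model="your-username/luxury-fashion-ner",
    aggregation_strategy="simple",  # merge sub-word pieces into entity spans
)

print(ner("Gucci handbag under 5000 AED"))
# Expected output shape (scores and offsets depend on training):
# [{'entity_group': 'BRAND', 'word': 'Gucci', ...},
#  {'entity_group': 'CATEGORY', 'word': 'handbag', ...},
#  {'entity_group': 'PRICE', 'word': '5000', ...}]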