kasim90 committed on
Commit 66c7691 · verified · 1 Parent(s): 50be0cf

Update app.py

Files changed (1): app.py (+31 -49)
app.py CHANGED
@@ -1,84 +1,66 @@
  import torch
- import spaces
  from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
  from peft import LoraConfig, get_peft_model
  from datasets import load_dataset

  # === 1️⃣ MODEL AND TOKENIZER LOADING ===
- MODEL_NAME = "mistralai/Mistral-7B-v0.1"
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

- # === 2️⃣ LoRA SETTINGS ===
  lora_config = LoraConfig(
-     r=8,
-     lora_alpha=32,
-     lora_dropout=0.1,
      bias="none",
-     target_modules=["q_proj", "v_proj"],
  )

- # === 3️⃣ DATASET ===
- dataset = load_dataset("oscar", "unshuffled_deduplicated_tr", split="train", streaming=True, trust_remote_code=True)
- dataset = dataset.shuffle(seed=42).take(10000)

  def tokenize_function(examples):
      return tokenizer(examples["text"], truncation=True, max_length=512)

- tokenized_datasets = dataset.map(tokenize_function, batched=True)

- # === 4️⃣ TRAINING SETTINGS ===
- # === 4️⃣ TRAINING SETTINGS ===
- num_train_steps = 10000 // 1  # 1 batch per example, 10,000 samples in total, batch size 1
- max_steps = num_train_steps * 1  # number of steps for 1 epoch (over the 10,000-sample dataset)

  training_args = TrainingArguments(
      output_dir="./mistral_lora",
-     per_device_train_batch_size=1,
-     gradient_accumulation_steps=16,
-     learning_rate=5e-4,
-     num_train_epochs=1,
-     max_steps=max_steps,  # add the max_steps parameter
      save_steps=500,
      save_total_limit=2,
      logging_dir="./logs",
      logging_steps=10,
      optim="adamw_torch",
-     no_cuda=True,
  )

-
- # === 5️⃣ GPU STARTUP AND TRAINING ===
- @spaces.GPU
  def train_model():
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-
-     # Load the model here
-     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32).to(device)
-     model = get_peft_model(model, lora_config)
-
-     # TrainingArguments are defined here!
-     training_args = TrainingArguments(
-         output_dir="./mistral_lora",
-         per_device_train_batch_size=1,
-         gradient_accumulation_steps=16,
-         learning_rate=5e-4,
-         num_train_epochs=1,
-         save_steps=500,
-         save_total_limit=2,
-         logging_dir="./logs",
-         logging_steps=10,
-         optim="adamw_torch",
-     )
-
      trainer = Trainer(
          model=model,
          args=training_args,
          train_dataset=tokenized_datasets,
      )
      trainer.train()
-     return "✅ Model training completed!"
-

- # === 6️⃣ LAUNCH ===
- if __name__ == "__main__":
-     train_model()  # start training
 
  import torch
  from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
  from peft import LoraConfig, get_peft_model
  from datasets import load_dataset
+ import gradio as gr

  # === 1️⃣ MODEL AND TOKENIZER LOADING ===
+ MODEL_NAME = "mistralai/Mistral-7B-v0.1"  # Hugging Face model name
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

+ # === 2️⃣ CPU OPTIMIZATION ===
+ torch_dtype = torch.float32  # dtype suited to CPU
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch_dtype)
+
+ # === 3️⃣ LoRA SETTINGS ===
  lora_config = LoraConfig(
+     r=8,
+     lora_alpha=32,
+     lora_dropout=0.1,
      bias="none",
+     target_modules=["q_proj", "v_proj"],
  )
+ model = get_peft_model(model, lora_config)

+ # === 4️⃣ DATASET ===
+ dataset = load_dataset("oscar", "unshuffled_deduplicated_tr", trust_remote_code=True)  # trust_remote_code=True
+ subset = dataset["train"].shuffle(seed=42).select(range(10000))  # pick a small subset (10,000 examples)

+ # === 5️⃣ TOKENIZATION FUNCTION ===
  def tokenize_function(examples):
      return tokenizer(examples["text"], truncation=True, max_length=512)

+ tokenized_datasets = subset.map(tokenize_function, batched=True)

+ # === 6️⃣ TRAINING SETTINGS ===
+ # Work out how many training steps there are
+ train_size = len(tokenized_datasets)  # 10,000 examples
+ batch_size = 1  # batch size 1
+ num_epochs = 1  # train for 1 epoch
+ max_steps = (train_size // batch_size) * num_epochs  # compute max_steps

  training_args = TrainingArguments(
      output_dir="./mistral_lora",
+     per_device_train_batch_size=1,
+     gradient_accumulation_steps=16,
+     learning_rate=5e-4,
+     num_train_epochs=1,
+     max_steps=max_steps,  # add the max_steps parameter here
      save_steps=500,
      save_total_limit=2,
      logging_dir="./logs",
      logging_steps=10,
      optim="adamw_torch",
+     no_cuda=True,  # no GPU is used
  )

+ # === 7️⃣ MODEL TRAINING ===
  def train_model():
      trainer = Trainer(
          model=model,
          args=training_args,
          train_dataset=tokenized_datasets,
      )
      trainer.train()

+ train_model()  # start training
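
For reference, a minimal inference sketch, assuming the training run above completed and saved a LoRA adapter under output_dir ("./mistral_lora"). The checkpoint-500 directory name is hypothetical: Trainer writes checkpoint-<step> folders every save_steps, so the exact path depends on the run.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

MODEL_NAME = "mistralai/Mistral-7B-v0.1"
ADAPTER_DIR = "./mistral_lora/checkpoint-500"  # hypothetical checkpoint path; depends on the run

# Load the base model and attach the trained LoRA weights on top of it
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
model.eval()

# Generate a short Turkish continuation with the adapted model
inputs = tokenizer("Merhaba, bugün hava", return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

If a standalone model is preferred, peft's merge_and_unload() can fold the LoRA weights into the base model after loading.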