Spaces:

shorecode
/

gradio-3

Sleeping

App Files Files Community

Kevin Fink committed on Dec 6, 2024

Commit

915a0f9

1 Parent(s): 1e083b4

dev

Browse files

Files changed (2) hide show

app (copy).py +179 -0
app.py +39 -83

app (copy).py ADDED Viewed

	@@ -0,0 +1,179 @@

+import spaces
+import gradio as gr
+from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
+from transformers import DataCollatorForSeq2Seq
+from datasets import load_dataset, concatenate_datasets, load_from_disk
+import traceback
+from sklearn.metrics import accuracy_score
+import numpy as np
+import torch
+import os
+from huggingface_hub import login
+from peft import get_peft_model, LoraConfig
+#os.environ['HF_HOME'] = '/data/.huggingface'
+@spaces.GPU(duration=120)
+def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
+    try:
+        torch.cuda.empty_cache()
+        def compute_metrics(eval_pred):
+            logits, labels = eval_pred
+            predictions = np.argmax(logits, axis=1)
+            accuracy = accuracy_score(labels, predictions)
+            return {
+                'eval_accuracy': accuracy,
+                'eval_loss': eval_pred.loss,  # If you want to include loss as well
+            }
+        login(api_key.strip())
+        lora_config = LoraConfig(
+            r=16,  # Rank of the low-rank adaptation
+            lora_alpha=32,  # Scaling factor
+            lora_dropout=0.1,  # Dropout for LoRA layers
+            bias="none"  # Bias handling
+        )
+        # Load the model and tokenizer
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_name.strip(), num_labels=2, force_download=True)
+        model.gradient_checkpointing_enable()
+        #model = get_peft_model(model, lora_config)
+        # Set training arguments
+        training_args = TrainingArguments(
+            output_dir='/data/results',
+            eval_strategy="steps",  # Change this to steps
+            save_strategy='steps',
+            learning_rate=lr*0.00001,
+            per_device_train_batch_size=int(batch_size),
+            per_device_eval_batch_size=int(batch_size),
+            num_train_epochs=int(num_epochs),
+            weight_decay=0.01,
+            gradient_accumulation_steps=int(grad),
+            max_grad_norm = 1.0,
+            load_best_model_at_end=True,
+            metric_for_best_model="accuracy",
+            greater_is_better=True,
+            logging_dir='/data/logs',
+            logging_steps=10,
+            #push_to_hub=True,
+            hub_model_id=hub_id.strip(),
+            fp16=True,
+            #lr_scheduler_type='cosine',
+            save_steps=100,  # Save checkpoint every 500 steps
+            save_total_limit=3,
+        )
+        # Check if a checkpoint exists and load it
+        if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
+            print("Loading model from checkpoint...")
+            model = AutoModelForSeq2SeqLM.from_pretrained(training_args.output_dir)
+        max_length = 128
+        try:
+            tokenized_train_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
+            tokenized_test_dataset = load_from_disk(f'/data/{hub_id.strip()}_test_dataset')
+            # Create Trainer
+            trainer = Trainer(
+                model=model,
+                args=training_args,
+                train_dataset=tokenized_train_dataset,
+                eval_dataset=tokenized_test_dataset,
+                compute_metrics=compute_metrics,
+                #callbacks=[LoggingCallback()],
+            )
+        except:
+            # Load the dataset
+            dataset = load_dataset(dataset_name.strip())
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            # Tokenize the dataset
+            def tokenize_function(examples):
+                # Assuming 'text' is the input and 'target' is the expected output
+                model_inputs = tokenizer(
+                    examples['text'],
+                    max_length=max_length,  # Set to None for dynamic padding
+                    padding=True,     # Disable padding here, we will handle it later
+                    truncation=True,
+                )
+                # Setup the decoder input IDs (shifted right)
+                labels = tokenizer(
+                    examples['target'],
+                    max_length=max_length,  # Set to None for dynamic padding
+                    padding=True,     # Disable padding here, we will handle it later
+                    truncation=True,
+                    text_target=examples['target']  # Use text_target for target text
+                )
+                # Add labels to the model inputs
+                model_inputs["labels"] = labels["input_ids"]
+                return model_inputs
+            tokenized_datasets = dataset.map(tokenize_function, batched=True)
+            tokenized_datasets['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
+            tokenized_datasets['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
+            # Create Trainer
+            trainer = Trainer(
+                model=model,
+                args=training_args,
+                train_dataset=tokenized_datasets['train'],
+                eval_dataset=tokenized_datasets['test'],
+                compute_metrics=compute_metrics,
+                #callbacks=[LoggingCallback()],
+            )
+        # Fine-tune the model
+        trainer.train()
+        trainer.push_to_hub(commit_message="Training complete!")
+    except Exception as e:
+        return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
+    return 'DONE!'#model
+'''
+# Define Gradio interface
+def predict(text):
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name.strip(), num_labels=2)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+    outputs = model(inputs)
+    predictions = outputs.logits.argmax(dim=-1)
+    return predictions.item()
+'''
+# Create Gradio interface
+try:
+    model = AutoModelForSeq2SeqLM.from_pretrained('google/t5-efficient-tiny-nh8'.strip(), num_labels=2, force_download=True)
+    iface = gr.Interface(
+        fn=fine_tune_model,
+        inputs=[
+            gr.Textbox(label="Model Name (e.g., 'google/t5-efficient-tiny-nh8')"),
+            gr.Textbox(label="Dataset Name (e.g., 'imdb')"),
+            gr.Textbox(label="HF hub to push to after training"),
+            gr.Textbox(label="HF API token"),
+            gr.Slider(minimum=1, maximum=10, value=3, label="Number of Epochs", step=1),
+            gr.Slider(minimum=1, maximum=2000, value=1, label="Batch Size", step=1),
+            gr.Slider(minimum=1, maximum=1000, value=1, label="Learning Rate (e-5)", step=1),
+            gr.Slider(minimum=1, maximum=100, value=1, label="Gradient accumulation", step=1),
+        ],
+        outputs="text",
+        title="Fine-Tune Hugging Face Model",
+        description="This interface allows you to fine-tune a Hugging Face model on a specified dataset."
+    )
+    '''
+    iface = gr.Interface(
+        fn=predict,
+        inputs=[
+            gr.Textbox(label="Query"),
+        ],
+        outputs="text",
+        title="Fine-Tune Hugging Face Model",
+        description="This interface allows you to test a fine-tune Hugging Face model."
+    )
+    '''
+    # Launch the interface
+    iface.launch()
+except Exception as e:
+    print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")

app.py CHANGED Viewed

@@ -4,40 +4,15 @@ from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelFor
 from transformers import DataCollatorForSeq2Seq
 from datasets import load_dataset, concatenate_datasets, load_from_disk
 import traceback
-from sklearn.metrics import accuracy_score
-import numpy as np
-import torch
 import os
 from huggingface_hub import login
-from peft import get_peft_model, LoraConfig
-#os.environ['HF_HOME'] = '/data/.huggingface'
-@spaces.GPU(duration=120)
 def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
     try:
-        torch.cuda.empty_cache()
-        def compute_metrics(eval_pred):
-            logits, labels = eval_pred
-            predictions = np.argmax(logits, axis=1)
-            accuracy = accuracy_score(labels, predictions)
-            return {
-                'eval_accuracy': accuracy,
-                'eval_loss': eval_pred.loss,  # If you want to include loss as well
-            }
         login(api_key.strip())
-        lora_config = LoraConfig(
-            r=16,  # Rank of the low-rank adaptation
-            lora_alpha=32,  # Scaling factor
-            lora_dropout=0.1,  # Dropout for LoRA layers
-            bias="none"  # Bias handling
-        )
         # Load the model and tokenizer
         model = AutoModelForSeq2SeqLM.from_pretrained(model_name.strip(), num_labels=2, force_download=True)
-        model.gradient_checkpointing_enable()
-        #model = get_peft_model(model, lora_config)
         # Set training arguments
@@ -65,66 +40,48 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
             save_total_limit=3,
         )
         # Check if a checkpoint exists and load it
-        if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
-            print("Loading model from checkpoint...")
-            model = AutoModelForSeq2SeqLM.from_pretrained(training_args.output_dir)
         max_length = 128
-        try:
-            tokenized_train_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
-            tokenized_test_dataset = load_from_disk(f'/data/{hub_id.strip()}_test_dataset')
-            # Create Trainer
-            trainer = Trainer(
-                model=model,
-                args=training_args,
-                train_dataset=tokenized_train_dataset,
-                eval_dataset=tokenized_test_dataset,
-                compute_metrics=compute_metrics,
-                #callbacks=[LoggingCallback()],
-            )
-        except:
-            # Load the dataset
-            dataset = load_dataset(dataset_name.strip())
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
-            # Tokenize the dataset
-            def tokenize_function(examples):
-                # Assuming 'text' is the input and 'target' is the expected output
-                model_inputs = tokenizer(
-                    examples['text'],
-                    max_length=max_length,  # Set to None for dynamic padding
-                    padding=True,     # Disable padding here, we will handle it later
-                    truncation=True,
-                )
-                # Setup the decoder input IDs (shifted right)
-                labels = tokenizer(
-                    examples['target'],
-                    max_length=max_length,  # Set to None for dynamic padding
-                    padding=True,     # Disable padding here, we will handle it later
-                    truncation=True,
-                    text_target=examples['target']  # Use text_target for target text
-                )
-                # Add labels to the model inputs
-                model_inputs["labels"] = labels["input_ids"]
-                return model_inputs
-            tokenized_datasets = dataset.map(tokenize_function, batched=True)
-            tokenized_datasets['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
-            tokenized_datasets['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
-            # Create Trainer
-            trainer = Trainer(
-                model=model,
-                args=training_args,
-                train_dataset=tokenized_datasets['train'],
-                eval_dataset=tokenized_datasets['test'],
-                compute_metrics=compute_metrics,
-                #callbacks=[LoggingCallback()],
-            )
         # Fine-tune the model
         trainer.train()
@@ -144,7 +101,6 @@ def predict(text):
 '''
 # Create Gradio interface
 try:
-    model = AutoModelForSeq2SeqLM.from_pretrained('google/t5-efficient-tiny-nh8'.strip(), num_labels=2, force_download=True)
     iface = gr.Interface(
         fn=fine_tune_model,
         inputs=[

 from transformers import DataCollatorForSeq2Seq
 from datasets import load_dataset, concatenate_datasets, load_from_disk
 import traceback
 import os
 from huggingface_hub import login
 def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch_size, lr, grad):
     try:
         login(api_key.strip())
         # Load the model and tokenizer
         model = AutoModelForSeq2SeqLM.from_pretrained(model_name.strip(), num_labels=2, force_download=True)
         # Set training arguments
             save_total_limit=3,
         )
         # Check if a checkpoint exists and load it
         max_length = 128
+        # Load the dataset
+        dataset = load_dataset(dataset_name.strip())
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        # Tokenize the dataset
+        def tokenize_function(examples):
+            # Assuming 'text' is the input and 'target' is the expected output
+            model_inputs = tokenizer(
+                examples['text'],
+                max_length=max_length,  # Set to None for dynamic padding
+                padding=True,     # Disable padding here, we will handle it later
+                truncation=True,
+            )
+            # Setup the decoder input IDs (shifted right)
+            labels = tokenizer(
+                examples['target'],
+                max_length=max_length,  # Set to None for dynamic padding
+                padding=True,     # Disable padding here, we will handle it later
+                truncation=True,
+                text_target=examples['target']  # Use text_target for target text
+            )
+            # Add labels to the model inputs
+            model_inputs["labels"] = labels["input_ids"]
+        tokenized_datasets = dataset.map(tokenize_function, batched=True)
+        tokenized_datasets['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
+        tokenized_datasets['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
+        # Create Trainer
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=tokenized_datasets['train'],
+            eval_dataset=tokenized_datasets['test'],
+            compute_metrics=compute_metrics,
+            #callbacks=[LoggingCallback()],
+        )
         # Fine-tune the model
         trainer.train()
 '''
 # Create Gradio interface
 try:
     iface = gr.Interface(
         fn=fine_tune_model,
         inputs=[