Manasa1 committed on
Commit a6df729 · verified · 1 Parent(s): 8ddbef4

Update app.py

Files changed (1)
  1. app.py +37 -32
app.py CHANGED
@@ -42,46 +42,51 @@ if uploaded_file is not None:
 
     # Step 4: Fine-tune a model on the extracted tweets
     def fine_tune_model(tweets):
+
         # Convert tweets to a DataFrame and Dataset
         df = pd.DataFrame(tweets, columns=["text"])
         tweet_dataset = Dataset.from_pandas(df)
 
-        # Load model and tokenizer
+        # Load model and tokenizer
         model_name = "gpt2"  # Replace with a suitable model if needed
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(model_name)
 
-        # Tokenize the dataset
-        def tokenize_function(examples):
-            return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
-
-        tokenized_tweets = tweet_dataset.map(tokenize_function, batched=True)
-
-        # Training arguments
-        training_args = TrainingArguments(
-            output_dir="./fine_tuned_tweet_model",
-            per_device_train_batch_size=4,
-            num_train_epochs=3,
-            save_steps=10_000,
-            save_total_limit=1,
-            logging_dir='./logs',
-        )
-
-        # Initialize the Trainer
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            train_dataset=tokenized_tweets,
-        )
-
-        # Fine-tune the model
-        trainer.train()
-
-        # Save the fine-tuned model
-        model.save_pretrained("fine_tuned_tweet_model")
-        tokenizer.save_pretrained("fine_tuned_tweet_model")
-
-        return model, tokenizer
+        # Set the pad token
+        tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token
+
+        # Tokenize the dataset
+        def tokenize_function(examples):
+            return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
+
+        tokenized_tweets = tweet_dataset.map(tokenize_function, batched=True)
+
+        # Training arguments
+        training_args = TrainingArguments(
+            output_dir="./fine_tuned_tweet_model",
+            per_device_train_batch_size=4,
+            num_train_epochs=3,
+            save_steps=10_000,
+            save_total_limit=1,
+            logging_dir='./logs',
+        )
+
+        # Initialize the Trainer
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=tokenized_tweets,
+        )
+
+        # Fine-tune the model
+        trainer.train()
+
+        # Save the fine-tuned model
+        model.save_pretrained("fine_tuned_tweet_model")
+        tokenizer.save_pretrained("fine_tuned_tweet_model")
+
+        return model, tokenizer
+
 
     # Trigger fine-tuning and notify user
     with st.spinner("Fine-tuning model..."):
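
A note on the main change: GPT-2's tokenizer ships with eos/bos tokens but no pad token, so the pre-existing padding="max_length" call would fail with "Asking to pad but the tokenizer does not have a padding token" the first time it ran. Reusing eos_token as the pad token, as this commit does, is the standard workaround. A minimal standalone sketch of the failure mode and fix:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
assert tokenizer.pad_token is None  # GPT-2 defines eos/bos but no pad token

tokenizer.pad_token = tokenizer.eos_token  # the line this commit adds
batch = tokenizer("example tweet", padding="max_length", truncation=True, max_length=128)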
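
One thing the commit does not touch: tokenized_tweets carries only input_ids and attention_mask, and Trainer needs labels to compute a causal-LM loss, so trainer.train() may still stop with a "model did not return a loss" error. A hedged sketch of the usual remedy, reusing the names from the diff above (not part of this commit): DataCollatorForLanguageModeling with mlm=False copies input_ids into labels at batch time, masking pad positions to -100.

from transformers import DataCollatorForLanguageModeling, Trainer

# mlm=False selects causal-LM behavior: labels are built from input_ids
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tweets,
    data_collator=data_collator,  # supplies the labels the loss needs
)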
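
For completeness, a minimal sketch of loading the saved directory back for inference; the directory name comes from the commit, while the prompt string is a made-up example:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("fine_tuned_tweet_model")
model = AutoModelForCausalLM.from_pretrained("fine_tuned_tweet_model")

inputs = tokenizer("Just landed in", return_tensors="pt")  # hypothetical prompt
outputs = model.generate(**inputs, max_new_tokens=30, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))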