Manasa1 committed on
Commit a6df729 · verified · 1 Parent(s): 8ddbef4

Update app.py

Files changed (1)
  1. app.py +37 -32
app.py CHANGED
@@ -42,46 +42,51 @@ if uploaded_file is not None:
 
     # Step 4: Fine-tune a model on the extracted tweets
     def fine_tune_model(tweets):
+
         # Convert tweets to a DataFrame and Dataset
         df = pd.DataFrame(tweets, columns=["text"])
         tweet_dataset = Dataset.from_pandas(df)
 
-        # Load model and tokenizer
+        # Load model and tokenizer
         model_name = "gpt2"  # Replace with a suitable model if needed
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(model_name)
 
-        # Tokenize the dataset
-        def tokenize_function(examples):
-            return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
-
-        tokenized_tweets = tweet_dataset.map(tokenize_function, batched=True)
-
-        # Training arguments
-        training_args = TrainingArguments(
-            output_dir="./fine_tuned_tweet_model",
-            per_device_train_batch_size=4,
-            num_train_epochs=3,
-            save_steps=10_000,
-            save_total_limit=1,
-            logging_dir='./logs',
-        )
-
-        # Initialize the Trainer
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            train_dataset=tokenized_tweets,
-        )
-
-        # Fine-tune the model
-        trainer.train()
-
-        # Save the fine-tuned model
-        model.save_pretrained("fine_tuned_tweet_model")
-        tokenizer.save_pretrained("fine_tuned_tweet_model")
-
-        return model, tokenizer
+        # Set the pad token
+        tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token
+
+        # Tokenize the dataset
+        def tokenize_function(examples):
+            return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
+
+        tokenized_tweets = tweet_dataset.map(tokenize_function, batched=True)
+
+        # Training arguments
+        training_args = TrainingArguments(
+            output_dir="./fine_tuned_tweet_model",
+            per_device_train_batch_size=4,
+            num_train_epochs=3,
+            save_steps=10_000,
+            save_total_limit=1,
+            logging_dir='./logs',
+        )
+
+        # Initialize the Trainer
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=tokenized_tweets,
+        )
+
+        # Fine-tune the model
+        trainer.train()
+
+        # Save the fine-tuned model
+        model.save_pretrained("fine_tuned_tweet_model")
+        tokenizer.save_pretrained("fine_tuned_tweet_model")
+
+        return model, tokenizer
+
 
     # Trigger fine-tuning and notify user
     with st.spinner("Fine-tuning model..."):
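
A note on the main change: GPT-2's tokenizer ships with eos/bos tokens but no pad token, so the pre-existing padding="max_length" call would fail with "Asking to pad but the tokenizer does not have a padding token" the first time it ran. Reusing eos_token as the pad token, as this commit does, is the standard workaround. A minimal standalone sketch of the failure mode and fix:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
assert tokenizer.pad_token is None  # GPT-2 defines eos/bos but no pad token

tokenizer.pad_token = tokenizer.eos_token  # the line this commit adds
batch = tokenizer("example tweet", padding="max_length", truncation=True, max_length=128)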
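
One thing the commit does not touch: tokenized_tweets carries only input_ids and attention_mask, and Trainer needs labels to compute a causal-LM loss, so trainer.train() may still stop with a "model did not return a loss" error. A hedged sketch of the usual remedy, reusing the names from the diff above (not part of this commit): DataCollatorForLanguageModeling with mlm=False copies input_ids into labels at batch time, masking pad positions to -100.

from transformers import DataCollatorForLanguageModeling, Trainer

# mlm=False selects causal-LM behavior: labels are built from input_ids
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tweets,
    data_collator=data_collator,  # supplies the labels the loss needs
)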
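
For completeness, a minimal sketch of loading the saved directory back for inference; the directory name comes from the commit, while the prompt string is a made-up example:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("fine_tuned_tweet_model")
model = AutoModelForCausalLM.from_pretrained("fine_tuned_tweet_model")

inputs = tokenizer("Just landed in", return_tensors="pt")  # hypothetical prompt
outputs = model.generate(**inputs, max_new_tokens=30, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))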