Spaces:
Sleeping
Sleeping
File size: 5,376 Bytes
fe2a0f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
import pandas as pd
import together
import json
import os
from sklearn.model_selection import train_test_split
class ModelTrainer:
    """Prepares CSV scam-detection data and launches a Together AI fine-tune.

    Workflow: read a CSV, auto-detect the text/label columns, split into
    train/validation sets, write Together-format JSONL files, upload them,
    and create a fine-tuning job.
    """

    def __init__(self, api_key):
        """Store the Together AI API key on the module-level client."""
        together.api_key = api_key

    def prepare_data(self, csv_path, train_path='training_data.jsonl',
                     val_path='validation_data.jsonl', test_size=0.2):
        """Prepare data from a CSV file for fine-tuning.

        Args:
            csv_path: Path to the source CSV file.
            train_path: Output path for the training JSONL file.
            val_path: Output path for the validation JSONL file.
            test_size: Fraction of rows held out for validation.

        Returns:
            Tuple ``(train_path, val_path)`` of the written JSONL files.
        """
        print("Loading CSV file...")
        df = pd.read_csv(csv_path, encoding='utf-8')

        # Print column names to help debug schema mismatches.
        print("Available columns in CSV:", df.columns.tolist())

        text_column = self._get_text_column(df)
        label_column = self._get_label_column(df)
        print(f"Using '{text_column}' as text column and '{label_column}' as label column")

        # Fixed random_state keeps the split reproducible across runs.
        train_df, val_df = train_test_split(df, test_size=test_size, random_state=42)
        print(f"Training samples: {len(train_df)}, Validation samples: {len(val_df)}")

        train_data = self._convert_to_together_format(train_df, text_column, label_column)
        val_data = self._convert_to_together_format(val_df, text_column, label_column)

        self._save_jsonl(train_data, train_path)
        self._save_jsonl(val_data, val_path)
        return train_path, val_path

    def _get_text_column(self, df):
        """Return the first column matching a known text-column name.

        Falls back to the DataFrame's first column when nothing matches.
        """
        text_column_options = ['text', 'message', 'content', 'Text', 'MESSAGE', 'CONTENT']
        for col in text_column_options:
            if col in df.columns:
                return col
        return df.columns[0]

    def _get_label_column(self, df):
        """Return the first column matching a known label-column name.

        Falls back to the DataFrame's last column when nothing matches.
        """
        label_column_options = ['label', 'Label', 'class', 'Class', 'target', 'Target']
        for col in label_column_options:
            if col in df.columns:
                return col
        return df.columns[-1]

    def _convert_to_together_format(self, df, text_column, label_column):
        """Convert a DataFrame into Together AI prompt/completion records.

        NOTE(review): assumes labels are numeric 0/1 (1 == scam);
        ``int(...)`` raises ValueError on non-numeric labels — confirm
        against the CSV schema.
        """
        formatted_data = []
        for _, row in df.iterrows():
            prompt = (
                "Analyze the following text and determine if it's a scam or not.\n\n"
                f"Text: {row[text_column]}\n\n"
                "Is this a scam? "
            )
            # Map the binary label onto a yes/no completion string.
            completion = "Yes" if int(row[label_column]) == 1 else "No"
            formatted_data.append({
                "prompt": prompt,
                "completion": completion
            })
        return formatted_data

    def _save_jsonl(self, data, filename):
        """Write records as one JSON object per line (JSONL).

        ensure_ascii=False keeps non-ASCII text readable, matching the
        UTF-8 encoding the file is opened with (the default would
        \\u-escape every non-ASCII character).
        """
        with open(filename, 'w', encoding='utf-8') as f:
            for item in data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')

    def upload_file(self, file_path):
        """Upload a file to Together AI and return its file ID."""
        print(f"Uploading {file_path}...")
        result = together.Files.upload(file_path)
        print(f"File uploaded with ID: {result['id']}")
        return result['id']

    def create_fine_tuning_job(self, training_file_id, validation_file_id):
        """Create and start a fine-tuning job; return the job ID."""
        job_params = {
            "training_file": training_file_id,
            "validation_file": validation_file_id,
            "model": "togethercomputer/RedPajama-INCITE-7B-Chat",
            "n_epochs": 3,
            "batch_size": 4,
            "learning_rate": 0.00001,
            "suffix": "scam_detector_v1"
        }
        result = together.FineTune.create(**job_params)
        return result['id']
def main():
    """Entry point: prepare data, upload it, and start a fine-tuning job."""
    # Read the API key from the environment instead of hard-coding a secret
    # in source control (the original embedded a literal key here, which is
    # a credential leak — that key should be revoked).
    api_key = os.environ.get("TOGETHER_API_KEY")
    if not api_key:
        raise SystemExit("Set the TOGETHER_API_KEY environment variable before running.")
    trainer = ModelTrainer(api_key=api_key)

    try:
        # Mount Google Drive when running inside Colab.
        from google.colab import drive
        print("Mounting Google Drive...")
        drive.mount('/content/drive')
        csv_path = '/content/drive/MyDrive/scam4.csv'
    except ImportError:
        # Not running in Colab: fall back to a local CSV path.
        csv_path = 'scam4.csv'

    try:
        # Prepare and upload the data files.
        train_file, val_file = trainer.prepare_data(csv_path)
        training_file_id = trainer.upload_file(train_file)
        validation_file_id = trainer.upload_file(val_file)

        # Start fine-tuning.
        print("\nStarting fine-tuning job...")
        job_id = trainer.create_fine_tuning_job(training_file_id, validation_file_id)
        print(f"Fine-tuning job created with ID: {job_id}")

        # Remove the temporary JSONL files only after a successful upload.
        os.remove(train_file)
        os.remove(val_file)
        print("\nTemporary files cleaned up")
    except Exception as e:
        # Top-level boundary: report the failure with a full traceback
        # rather than letting the script die with a bare stack dump.
        print(f"An error occurred: {str(e)}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
|