import streamlit as st
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import os
import json

# Dataset class for PyTorch
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Return input_ids, attention_mask, etc. plus the label for one item;
        # the encodings already hold tensors, so clone instead of re-wrapping
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])  # label is needed for loss calculation
        return item

    def __len__(self):
        return len(self.labels)
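
# Example usage (a sketch; `tok` stands for any HF tokenizer such as the one loaded below):
#   enc = tok(["some text"], padding=True, truncation=True, return_tensors="pt")
#   ds = TextDataset(encodings=dict(enc), labels=[0])
#   ds[0]  # -> {'input_ids': ..., 'attention_mask': ..., 'labels': tensor(0)}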

# Function to load configuration
def load_config(config_path='config.json'):
    with open(config_path, 'r') as f:
        config = json.load(f)
    return config
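
# A minimal config.json sketch (assumed shape; only 'num_workers' is read in main()):
# {
#     "num_workers": 4
# }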

# Main function
def main():
    st.title("CSV Data Processing and Model Training 🧠")

    # Load configuration
    config = load_config()

    # Upload multiple CSV files
    uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type="csv")

    if uploaded_files:
        combined_texts = []

        # Process each uploaded CSV file
        for uploaded_file in uploaded_files:
            df = pd.read_csv(uploaded_file)
            # Combine all columns into a single text string for each row
            combined_texts.extend(df.astype(str).agg(' '.join, axis=1))

        # Check the combined text
        st.write("Combined text for training:", combined_texts[:5])  # Show first 5 for verification

        # Ask the user if they want to load an existing model or train a new one
        use_existing_model = st.checkbox("Load an existing local model?", value=False)

        if use_existing_model:
            # Allow the user to select a local model directory
            model_path = st.text_input("Enter the path to the local model directory:", value="")
            if model_path and os.path.exists(model_path):
                model = AutoModelForSequenceClassification.from_pretrained(model_path)
                # Load the tokenizer saved alongside the model so vocabularies match
                tokenizer = AutoTokenizer.from_pretrained(model_path)
                st.write(f"Loaded model from {model_path} successfully! 🎉")
            else:
                st.warning("Please provide a valid model directory path.")
                return
        else:
            # Initialize a new model and its matching tokenizer
            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

        # Tokenize combined text data
        inputs = tokenizer(combined_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

        # Create dummy labels (0 for every entry; substitute real labels when available)
        labels = [0] * len(combined_texts)

        # Split input_ids, attention_mask, and labels together so they stay aligned
        train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
            inputs['input_ids'], inputs['attention_mask'], labels, test_size=0.2, random_state=42
        )

        # Prepare datasets (keeping the attention mask so padding tokens are ignored)
        train_dataset = TextDataset(encodings={'input_ids': train_inputs, 'attention_mask': train_masks}, labels=train_labels)
        val_dataset = TextDataset(encodings={'input_ids': val_inputs, 'attention_mask': val_masks}, labels=val_labels)

        # Determine number of DataLoader worker processes from config
        num_workers = config.get('num_workers', 4)

        # Set up DataLoaders
        train_dataloader = DataLoader(train_dataset, batch_size=8, num_workers=num_workers)
        val_dataloader = DataLoader(val_dataset, batch_size=8, num_workers=num_workers)
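        # Note: the Trainer below constructs its own DataLoaders internally, so the two
        # above are only useful for manual batch inspection or a custom training loop;
        # to pass num_workers through to Trainer, set
        # TrainingArguments(dataloader_num_workers=num_workers) instead.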

        # Training arguments
        training_args = TrainingArguments(
            output_dir='./results',          # output directory
            num_train_epochs=1,              # total number of training epochs
            per_device_train_batch_size=8,   # batch size per device during training
            per_device_eval_batch_size=8,    # batch size for evaluation
            warmup_steps=500,                # number of warmup steps for learning rate scheduler
            weight_decay=0.01,               # strength of weight decay
            logging_dir='./logs',            # directory for storing logs
            logging_steps=10,
            evaluation_strategy="epoch"      # run evaluation at the end of each epoch
        )

        # Initialize Trainer
        trainer = Trainer(
            model=model,                  # the instantiated 🤗 Transformers model to be trained
            args=training_args,           # training arguments, defined above
            train_dataset=train_dataset,  # training dataset
            eval_dataset=val_dataset      # evaluation dataset
        )

        # Ask for the save directory up front so training and saving happen in one pass
        save_path = st.text_input("Enter the directory path to save the trained model:", value="./trained_model")

        # Gate training behind a button: Streamlit re-runs the whole script on every
        # widget interaction, so an unguarded trainer.train() would retrain repeatedly
        if st.button("Start training"):
            trainer.train()

            if save_path:
                os.makedirs(save_path, exist_ok=True)
                model.save_pretrained(save_path)
                tokenizer.save_pretrained(save_path)
                st.write(f"Model saved successfully to {save_path}! 🎉")
            else:
                st.warning("Please provide a valid directory path to save the model.")

            # Notify user of training completion
            st.success("Training completed successfully! 🎉")

if __name__ == "__main__":
    main()
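
# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py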