Spaces:

luohoa97
/

train

Sleeping

File size: 5,513 Bytes

40510d6

import streamlit as st
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import os
import json

# Dataset class for PyTorch
class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from feature name (e.g. ``'input_ids'``) to an
            indexable collection with one entry per example. Values may be
            tensors or plain Python sequences.
        labels: Indexable collection holding one label per example; its
            length defines the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor_copy(value):
        """Return *value* as a fresh tensor.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning,
        so tensors are copied with ``clone().detach()`` (the equivalent that
        torch recommends); anything else goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding feature plus ``'labels'`` for *idx*."""
        item = {key: self._as_tensor_copy(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor_copy(self.labels[idx])  # needed for loss calculation
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

# Function to load configuration
def load_config(config_path='config.json'):
    """Read a JSON configuration file and return its parsed content.

    Args:
        config_path: Path to the JSON file. Defaults to ``'config.json'``
            relative to the current working directory.

    Returns:
        The deserialized JSON document (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; don't rely on the platform's locale
    # default encoding, which may differ (e.g. on Windows).
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Main function
def main():
    """Render the Streamlit page: upload CSVs, fine-tune a classifier, save it.

    Flow:
      1. Load ``config.json`` via ``load_config``.
      2. Let the user upload CSV files; every row is flattened into one
         space-joined text string.
      3. Let the user choose between a local model directory and a fresh
         ``bert-base-uncased`` model, and pick the output directory.
      4. Once "Start training" is pressed, tokenize, split 80/20, train for
         one epoch with the Hugging Face ``Trainer``, and save model and
         tokenizer.

    Streamlit re-executes the script on every widget interaction, so all
    inputs are collected first and the expensive work is gated behind a
    button; otherwise editing any widget would silently retrain the model.
    """
    st.title("CSV Data Processing and Model Training 🧠")

    # Load configuration
    config = load_config()

    # Upload multiple CSV files
    uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type="csv")
    if not uploaded_files:
        return

    combined_texts = _combine_csv_rows(uploaded_files)

    # train_test_split cannot produce two non-empty sets from fewer than two rows.
    if len(combined_texts) < 2:
        st.warning("At least two rows of data are required to build training and validation sets.")
        return

    # Show the first 5 combined rows for verification
    st.write("Combined text for training:", combined_texts[:5])

    # Ask the user if they want to load an existing model or train a new one
    use_existing_model = st.checkbox("Load an existing local model?", value=False)
    model_path = ""
    if use_existing_model:
        model_path = st.text_input("Enter the path to the local model directory:", value="")
        if not (model_path and os.path.isdir(model_path)):
            st.warning("Please provide a valid model directory path.")
            return

    # Ask for the output directory up front so that changing it never
    # re-triggers a training run.
    save_path = st.text_input("Enter the directory path to save the trained model:", value="./trained_model")
    if not save_path:
        st.warning("Please provide a valid directory path to save the model.")
        return

    if not st.button("Start training"):
        return

    if use_existing_model:
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        st.write(f"Loaded model from {model_path} successfully! 🎉")
    else:
        # Initialize a new model
        model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # NOTE(review): the tokenizer always comes from 'bert-base-uncased', even
    # when a local model is loaded — confirm the local model shares that vocab.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize combined text data
    inputs = tokenizer(combined_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

    # Dummy labels (all zeros): the uploaded data carries no label column.
    labels = [0] * len(combined_texts)

    # Split row indices rather than a single tensor so that every tokenizer
    # output (input_ids, attention_mask, ...) stays aligned. Dropping the
    # attention mask would make the model attend to padding tokens.
    train_idx, val_idx = train_test_split(
        list(range(len(combined_texts))), test_size=0.2, random_state=42
    )
    train_dataset = _subset_dataset(inputs, labels, train_idx)
    val_dataset = _subset_dataset(inputs, labels, val_idx)

    # The Trainer builds its own DataLoaders; the worker count from the
    # config is passed through its arguments instead of unused loaders.
    training_args = _build_training_args(config.get('num_workers', 4))

    # Initialize Trainer
    trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset             # evaluation dataset
    )

    # Start training
    trainer.train()

    os.makedirs(save_path, exist_ok=True)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    st.write(f"Model saved successfully to {save_path}! 🎉")

    # Notify user of training completion
    st.success("Training completed successfully! 🚀")


def _combine_csv_rows(uploaded_files):
    """Return one space-joined string per row across all uploaded CSV files."""
    texts = []
    for uploaded_file in uploaded_files:
        try:
            df = pd.read_csv(uploaded_file)
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
            file_name = getattr(uploaded_file, 'name', 'uploaded file')
            st.warning(f"Skipping {file_name}: {exc}")
            continue
        if df.empty:
            continue
        # Combine all columns into a single text string for each row
        texts.extend(df.astype(str).agg(' '.join, axis=1).tolist())
    return texts


def _subset_dataset(encodings, labels, indices):
    """Build a TextDataset from the rows of *encodings* and *labels* at *indices*."""
    subset = {key: tensor[indices] for key, tensor in encodings.items()}
    return TextDataset(encodings=subset, labels=[labels[i] for i in indices])


def _build_training_args(num_workers):
    """Create the TrainingArguments for a one-epoch run with per-epoch evaluation."""
    # Newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy`; pick whichever field this installed version defines.
    known_fields = getattr(TrainingArguments, "__dataclass_fields__", {})
    strategy_key = "eval_strategy" if "eval_strategy" in known_fields else "evaluation_strategy"
    return TrainingArguments(
        output_dir='./results',              # output directory
        num_train_epochs=1,                  # total number of training epochs
        per_device_train_batch_size=8,       # batch size per device during training
        per_device_eval_batch_size=8,        # batch size for evaluation
        warmup_steps=500,                    # number of warmup steps for learning rate scheduler
        weight_decay=0.01,                   # strength of weight decay
        logging_dir='./logs',                # directory for storing logs
        logging_steps=10,
        dataloader_num_workers=num_workers,  # worker processes for the Trainer's DataLoaders
        **{strategy_key: "epoch"},
    )

# Entry point: call main() only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()