File size: 5,513 Bytes
40510d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import streamlit as st
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import os
import json

# Dataset class for PyTorch
class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to an
            indexable collection with one entry per sample. Entries may be
            plain Python sequences or ``torch.Tensor`` rows.
        labels: Indexable collection with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return *value* as a tensor that does not share storage with it."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning
            # on every call; detach().clone() is the recommended equivalent.
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding key plus ``labels``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])  # needed for loss calculation
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

# Function to load configuration
def load_config(config_path='config.json'):
    """Load the application configuration from a JSON file.

    Args:
        config_path: Path to the JSON configuration file.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces for it).

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Checkpoint used for freshly initialised models and as the tokenizer fallback
_BASE_CHECKPOINT = 'bert-base-uncased'


def _read_uploaded_texts(uploaded_files):
    """Turn every row of the uploaded CSV files into one space-joined string.

    Args:
        uploaded_files: Uploaded file objects returned by ``st.file_uploader``.

    Returns:
        list[str]: One string per CSV row, in upload order. Files without any
        data rows are skipped with a warning in the UI.
    """
    texts = []
    for uploaded_file in uploaded_files:
        try:
            df = pd.read_csv(uploaded_file)
        except pd.errors.EmptyDataError:
            st.warning(f"Skipping {uploaded_file.name}: the file is empty.")
            continue
        if df.empty:
            # A row-wise agg over a frame with no rows is not guaranteed to
            # produce a Series of strings, so never extend from it.
            st.warning(f"Skipping {uploaded_file.name}: no data rows found.")
            continue
        # Combine all columns into a single text string for each row
        texts.extend(df.astype(str).agg(' '.join, axis=1))
    return texts


def _split_encodings(encodings, labels, test_size=0.2, random_state=42):
    """Split tokenizer output and labels into aligned train/validation parts.

    Row indices are split instead of ``input_ids`` alone so that every tensor
    the tokenizer produced (notably ``attention_mask``) stays with its row.

    Args:
        encodings: Mapping of feature name to a tensor with one row per sample.
        labels: List with one label per sample.
        test_size: Fraction of samples assigned to the validation part.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``((train_encodings, train_labels), (val_encodings, val_labels))``.
    """
    row_ids = list(range(len(labels)))
    train_ids, val_ids = train_test_split(
        row_ids, test_size=test_size, random_state=random_state
    )

    def take(ids):
        subset = {key: tensor[ids] for key, tensor in encodings.items()}
        return subset, [labels[i] for i in ids]

    return take(train_ids), take(val_ids)


# Main function
def main():
    """Streamlit entry point: upload CSVs, fine-tune a classifier, save it.

    All user choices (model source, save directory) are collected before the
    "Start training" button. Streamlit re-executes the script on every widget
    interaction, so without the button any later input change would retrain.
    """
    st.title("CSV Data Processing and Model Training 🧠")

    # The configuration only tunes optional settings, so a missing file is
    # not fatal: fall back to defaults.
    try:
        config = load_config()
    except FileNotFoundError:
        config = {}

    # Upload multiple CSV files
    uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type="csv")
    if not uploaded_files:
        return

    combined_texts = _read_uploaded_texts(uploaded_files)
    if len(combined_texts) < 2:
        # train_test_split cannot produce two non-empty parts from < 2 rows.
        st.warning("At least two data rows are required to build training and validation sets.")
        return

    # Show the first 5 combined rows for verification
    st.write("Combined text for training:", combined_texts[:5])

    # Ask the user if they want to load an existing model or train a new one
    use_existing_model = st.checkbox("Load an existing local model?", value=False)
    model_path = ""
    if use_existing_model:
        model_path = st.text_input("Enter the path to the local model directory:", value="")
        if not (model_path and os.path.isdir(model_path)):
            st.warning("Please provide a valid model directory path.")
            return

    # Ask for the output directory up front so changing it never retrains
    save_path = st.text_input("Enter the directory path to save the trained model:", value="./trained_model")
    if not save_path:
        st.warning("Please provide a valid directory path to save the model.")
        return

    if not st.button("Start training"):
        return

    if use_existing_model:
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        st.write(f"Loaded model from {model_path} successfully! 🎉")
        tokenizer_source = model_path
    else:
        # Initialize a new model
        model = AutoModelForSequenceClassification.from_pretrained(_BASE_CHECKPOINT, num_labels=2)
        tokenizer_source = _BASE_CHECKPOINT

    # Prefer the tokenizer stored next to the model (this app saves one there)
    # so vocabulary and model match; fall back to the base checkpoint when the
    # directory holds no usable tokenizer.
    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)
    except (OSError, ValueError):
        tokenizer = AutoTokenizer.from_pretrained(_BASE_CHECKPOINT)

    # Tokenize combined text data
    inputs = tokenizer(combined_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

    # Placeholder labels: every row is assigned class 0.
    # NOTE(review): replace with real labels before relying on the trained model.
    labels = [0] * len(combined_texts)

    # Split data into training and validation sets, keeping all encodings
    (train_encodings, train_labels), (val_encodings, val_labels) = _split_encodings(inputs, labels)
    train_dataset = TextDataset(encodings=train_encodings, labels=train_labels)
    val_dataset = TextDataset(encodings=val_encodings, labels=val_labels)

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=1,              # total number of training epochs
        per_device_train_batch_size=8,   # batch size per device during training
        per_device_eval_batch_size=8,    # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
        # NOTE(review): recent transformers releases rename this argument to
        # `eval_strategy`; confirm against the installed version.
        evaluation_strategy="epoch",
        # The Trainer builds its own DataLoaders, so the worker count has to be
        # passed here. Default 0 keeps the Trainer's in-process loading.
        dataloader_num_workers=config.get('num_workers', 0),
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset             # evaluation dataset
    )

    # Start training
    trainer.train()

    # Persist model and tokenizer together so the directory can be reloaded
    os.makedirs(save_path, exist_ok=True)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    st.write(f"Model saved successfully to {save_path}! 🎉")

    # Notify user of training completion
    st.success("Training completed successfully! 🚀")

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()