luohoa97 committed
Commit 40510d6 · verified · 1 Parent(s): d0fe7a8

Create app.py

Files changed (1):
  app.py +134 -0
app.py ADDED
@@ -0,0 +1,134 @@
import streamlit as st
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import os
import json

# Dataset class for PyTorch
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Return input_ids, attention_mask, etc. plus labels for each item
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])  # Labels are needed for loss calculation
        return item

    def __len__(self):
        return len(self.labels)

# Function to load configuration (falls back to defaults if config.json is missing)
def load_config(config_path='config.json'):
    if not os.path.exists(config_path):
        return {}
    with open(config_path, 'r') as f:
        return json.load(f)

# Main function
def main():
    st.title("CSV Data Processing and Model Training 🧠")

    # Load configuration
    config = load_config()

    # Upload multiple CSV files
    uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type="csv")

    if uploaded_files:
        combined_texts = []

        # Process each uploaded CSV file
        for uploaded_file in uploaded_files:
            df = pd.read_csv(uploaded_file)

            # Combine all columns into a single text string for each row
            combined_texts.extend(df.astype(str).agg(' '.join, axis=1))

        # Show the first 5 combined rows for verification
        st.write("Combined text for training:", combined_texts[:5])

        # Ask the user whether to load an existing model or train a new one
        use_existing_model = st.checkbox("Load an existing local model?", value=False)

        if use_existing_model:
            # Allow the user to select a local model directory
            model_path = st.text_input("Enter the path to the local model directory:", value="")
            if model_path and os.path.exists(model_path):
                model = AutoModelForSequenceClassification.from_pretrained(model_path)
                # Reload the tokenizer that was saved alongside the model (see the save step below)
                tokenizer = AutoTokenizer.from_pretrained(model_path)
                st.write(f"Loaded model from {model_path} successfully! 🎉")
            else:
                st.warning("Please provide a valid model directory path.")
                return
        else:
            # Initialize a new model and its matching tokenizer
            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

        # Tokenize combined text data
        inputs = tokenizer(combined_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

        # Create dummy labels (0 for every row; substitute real labels for a real task)
        labels = [0] * len(combined_texts)

        # Split row indices into training and validation sets, then slice every
        # tokenizer output (input_ids, attention_mask, ...) with the same split
        train_idx, val_idx = train_test_split(
            list(range(len(labels))), test_size=0.2, random_state=42
        )
        train_encodings = {key: val[train_idx] for key, val in inputs.items()}
        val_encodings = {key: val[val_idx] for key, val in inputs.items()}
        train_labels = [labels[i] for i in train_idx]
        val_labels = [labels[i] for i in val_idx]

        # Prepare datasets
        train_dataset = TextDataset(encodings=train_encodings, labels=train_labels)
        val_dataset = TextDataset(encodings=val_encodings, labels=val_labels)

        # Number of data-loading workers from config; Trainer builds its own
        # DataLoaders, so the value is passed through TrainingArguments below
        num_workers = config.get('num_workers', 4)

        # Training arguments
        training_args = TrainingArguments(
            output_dir='./results',               # output directory
            num_train_epochs=1,                   # total number of training epochs
            per_device_train_batch_size=8,        # batch size per device during training
            per_device_eval_batch_size=8,         # batch size for evaluation
            warmup_steps=500,                     # number of warmup steps for the learning rate scheduler
            weight_decay=0.01,                    # strength of weight decay
            logging_dir='./logs',                 # directory for storing logs
            logging_steps=10,
            evaluation_strategy="epoch",
            dataloader_num_workers=num_workers
        )

        # Initialize Trainer
        trainer = Trainer(
            model=model,                          # the instantiated 🤗 Transformers model to be trained
            args=training_args,                   # training arguments, defined above
            train_dataset=train_dataset,          # training dataset
            eval_dataset=val_dataset              # evaluation dataset
        )

        # Start training
        trainer.train()

        # Ask the user for a directory to save the trained model
        save_path = st.text_input("Enter the directory path to save the trained model:", value="./trained_model")

        if save_path:
            os.makedirs(save_path, exist_ok=True)
            model.save_pretrained(save_path)
            tokenizer.save_pretrained(save_path)
            st.write(f"Model saved successfully to {save_path}! 🎉")
        else:
            st.warning("Please provide a valid directory path to save the model.")

        # Notify user of training completion
        st.success("Training completed successfully! 🚀")

if __name__ == "__main__":
    main()
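
A note on configuration and launch: num_workers is the only key load_config() reads, and the function falls back to defaults when no file is present, so a minimal (optional) config.json placed next to app.py is simply:

    {
        "num_workers": 4
    }

The app itself is then started the usual Streamlit way:

    streamlit run app.py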