diabolic6045 committed
Commit 23d3c60 · verified · 1 Parent(s): 8108179

Upload 2 files


GPT-2 with GA added

Files changed (2)
  1. app.py +242 -105
  2. requirements.txt +10 -4
app.py CHANGED
@@ -1,105 +1,242 @@
-
- import streamlit as st
- import numpy as np
- import tensorflow as tf
- from tensorflow import keras
- from sklearn.datasets import make_classification
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import accuracy_score
- import random
-
- # Define a function to generate a dataset
- def generate_dataset(task_id):
-     X, y = make_classification(n_samples=100, n_features=10, n_informative=5, n_redundant=3, n_repeated=2, random_state=task_id)
-     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=task_id)
-     return X_train, X_test, y_train, y_test
-
- # Define a neural network class
- class Net(keras.Model):
-     def __init__(self):
-         super(Net, self).__init__()
-         self.fc1 = keras.layers.Dense(20, activation='relu', input_shape=(10,))
-         self.fc2 = keras.layers.Dense(10, activation='relu')
-         self.fc3 = keras.layers.Dense(2)
-
-     def call(self, x):
-         x = self.fc1(x)
-         x = self.fc2(x)
-         x = self.fc3(x)
-         return x
-
- # Define a genetic algorithm class
- class GeneticAlgorithm:
-     def __init__(self, population_size):
-         self.population_size = population_size
-         self.population = [Net() for _ in range(population_size)]
-
-     def selection(self, task_id):
-         X_train, X_test, y_train, y_test = generate_dataset(task_id)
-         fitness = []
-         for net in self.population:
-             net.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
-             net.fit(X_train, y_train, epochs=10, verbose=0)
-             loss, accuracy = net.evaluate(X_test, y_test, verbose=0)
-             fitness.append(accuracy)
-         self.population = [self.population[i] for i in np.argsort(fitness)[-self.population_size//2:]]
-
-     def crossover(self):
-         offspring = []
-         X = np.random.rand(1, 10)  # dummy input to build the layers
-         for _ in range(self.population_size//2):
-             parent1, parent2 = random.sample(self.population, 2)
-             child = Net()
-             child(X)  # build the layers
-             parent1(X)  # build the layers
-             parent2(X)  # build the layers
-
-             # Average the weights of the two parents
-             parent1_weights = parent1.get_weights()
-             parent2_weights = parent2.get_weights()
-             child_weights = [(np.array(w1) + np.array(w2)) / 2 for w1, w2 in zip(parent1_weights, parent2_weights)]
-             child.set_weights(child_weights)
-
-             offspring.append(child)
-         self.population += offspring
-
-     def mutation(self):
-         X = np.random.rand(1, 10)  # dummy input to build the layers
-         for net in self.population:
-             net(X)  # build the layers
-             if random.random() < 0.1:
-                 weights = net.get_weights()
-                 new_weights = [np.array(w) + np.random.randn(*w.shape) * 0.1 for w in weights]
-                 net.set_weights(new_weights)
-
- # Streamlit app
- st.title("Evolution of Sub-Models")
-
- # Parameters
- st.sidebar.header("Parameters")
- population_size = st.sidebar.slider("Population size", 10, 100, 50)
- num_tasks = st.sidebar.slider("Number of tasks", 1, 10, 5)
- num_generations = st.sidebar.slider("Number of generations", 1, 100, 10)
-
- # Run the evolution
- if st.button("Run evolution"):
-     ga = GeneticAlgorithm(population_size)
-     for generation in range(num_generations):
-         for task_id in range(num_tasks):
-             ga.selection(task_id)
-             ga.crossover()
-             ga.mutation()
-         st.write(f"Generation {generation+1} complete")
-
-     # Evaluate the final population
-     final_accuracy = []
-     for task_id in range(num_tasks):
-         X_train, X_test, y_train, y_test = generate_dataset(task_id)
-         accuracy = []
-         for net in ga.population:
-             net.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
-             net.fit(X_train, y_train, epochs=10, verbose=0)
-             loss, acc = net.evaluate(X_test, y_test, verbose=0)
-             accuracy.append(acc)
-         final_accuracy.append(np.mean(accuracy))
-     st.write(f"Final accuracy: {np.mean(final_accuracy)}")
+ import streamlit as st
+ import numpy as np
+ import random
+ import torch
+ import transformers
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
+ from datasets import Dataset
+ import os
+
+ # Set random seeds for reproducibility
+ random.seed(42)
+ np.random.seed(42)
+ torch.manual_seed(42)
+
+ def generate_demo_data(num_samples=60):
+     # Generate meaningful sentences on various topics
+     subjects = [
+         'Artificial intelligence', 'Climate change', 'Renewable energy',
+         'Space exploration', 'Quantum computing', 'Genetic engineering',
+         'Blockchain technology', 'Virtual reality', 'Cybersecurity',
+         'Biotechnology', 'Nanotechnology', 'Astrophysics'
+     ]
+     verbs = [
+         'is transforming', 'is influencing', 'is revolutionizing',
+         'is challenging', 'is advancing', 'is reshaping', 'is impacting',
+         'is enhancing', 'is disrupting', 'is redefining'
+     ]
+     objects = [
+         'modern science', 'global economies', 'healthcare systems',
+         'communication methods', 'educational approaches',
+         'environmental policies', 'social interactions', 'the job market',
+         'data security', 'the entertainment industry'
+     ]
+     data = []
+     for i in range(num_samples):
+         subject = random.choice(subjects)
+         verb = random.choice(verbs)
+         obj = random.choice(objects)
+         sentence = f"{subject} {verb} {obj}."
+         data.append(sentence)
+     return data
+
+ def load_data(uploaded_file):
+     # Load user-uploaded text file
+     data = uploaded_file.read().decode("utf-8")
+     data = data.splitlines()
+     return data
+
+ def prepare_dataset(data, tokenizer, block_size=128):
+     # Tokenize the texts
+     def tokenize_function(examples):
+         return tokenizer(examples['text'], truncation=True, max_length=block_size, padding='max_length')
+
+     raw_dataset = Dataset.from_dict({'text': data})
+     tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
+
+     # Create labels for language modeling
+     tokenized_dataset = tokenized_dataset.map(
+         lambda examples: {'labels': examples['input_ids']},
+         batched=True
+     )
+
+     # Set the format for PyTorch
+     tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
+
+     return tokenized_dataset
+
+ def fitness_function(individual, train_dataset, model, tokenizer):
+     # Define the training arguments
+     training_args = TrainingArguments(
+         output_dir='./results',
+         overwrite_output_dir=True,
+         num_train_epochs=individual['epochs'],
+         per_device_train_batch_size=individual['batch_size'],
+         learning_rate=individual['learning_rate'],
+         logging_steps=10,
+         save_steps=10,
+         save_total_limit=2,
+         report_to='none',  # Disable logging to Wandb or other services
+     )
+
+     data_collator = DataCollatorForLanguageModeling(
+         tokenizer=tokenizer, mlm=False
+     )
+
+     # Train the model
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         data_collator=data_collator,
+         train_dataset=train_dataset,
+         eval_dataset=None,
+     )
+
+     trainer.train()
+
+     # For simplicity, use final training loss as fitness score
+     logs = [log for log in trainer.state.log_history if 'loss' in log]
+     if logs:
+         loss = logs[-1]['loss']
+     else:
+         loss = float('inf')
+     return loss
+
+ # Genetic Algorithm Functions
+ def create_population(size, param_bounds):
+     population = []
+     for _ in range(size):
+         individual = {
+             'learning_rate': random.uniform(*param_bounds['learning_rate']),
+             'epochs': random.randint(*param_bounds['epochs']),
+             'batch_size': random.choice(param_bounds['batch_size']),
+         }
+         population.append(individual)
+     return population
+
+ def select_mating_pool(population, fitnesses, num_parents):
+     parents = [population[i] for i in np.argsort(fitnesses)[:num_parents]]
+     return parents
+
+ def crossover(parents, offspring_size):
+     offspring = []
+     for _ in range(offspring_size):
+         parent1 = random.choice(parents)
+         parent2 = random.choice(parents)
+         child = {
+             'learning_rate': random.choice([parent1['learning_rate'], parent2['learning_rate']]),
+             'epochs': random.choice([parent1['epochs'], parent2['epochs']]),
+             'batch_size': random.choice([parent1['batch_size'], parent2['batch_size']]),
+         }
+         offspring.append(child)
+     return offspring
+
+ def mutation(offspring, param_bounds, mutation_rate=0.1):
+     for individual in offspring:
+         if random.random() < mutation_rate:
+             individual['learning_rate'] = random.uniform(*param_bounds['learning_rate'])
+         if random.random() < mutation_rate:
+             individual['epochs'] = random.randint(*param_bounds['epochs'])
+         if random.random() < mutation_rate:
+             individual['batch_size'] = random.choice(param_bounds['batch_size'])
+     return offspring
+
+ # Streamlit App
+ def main():
+     st.title("GPT-2 Fine-Tuning with Genetic Algorithm")
+
+     option = st.sidebar.selectbox(
+         'Choose Data Source',
+         ('DEMO', 'Upload Text File')
+     )
+
+     if option == 'DEMO':
+         st.write("Using DEMO data...")
+         data = generate_demo_data()
+     else:
+         st.write("Upload a text file for fine-tuning.")
+         uploaded_file = st.file_uploader("Choose a text file", type="txt")
+         if uploaded_file is not None:
+             data = load_data(uploaded_file)
+         else:
+             st.warning("Please upload a text file.")
+             st.stop()
+
+     # Load tokenizer and model
+     st.write("Loading GPT-2 tokenizer and model...")
+     tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+     model = GPT2LMHeadModel.from_pretrained('gpt2')
+     model.to('cuda' if torch.cuda.is_available() else 'cpu')
+
+     # Set the pad token
+     tokenizer.pad_token = tokenizer.eos_token
+     model.config.pad_token_id = model.config.eos_token_id
+
+     # Prepare dataset
+     st.write("Preparing dataset...")
+     train_dataset = prepare_dataset(data, tokenizer)
+
+     # GA Parameters
+     st.sidebar.subheader("Genetic Algorithm Parameters")
+     population_size = st.sidebar.number_input("Population Size", 4, 20, 6)
+     num_generations = st.sidebar.number_input("Number of Generations", 1, 10, 3)
+     num_parents = st.sidebar.number_input("Number of Parents", 2, population_size, 2)
+     mutation_rate = st.sidebar.slider("Mutation Rate", 0.0, 1.0, 0.1)
+
+     # Hyperparameter bounds
+     param_bounds = {
+         'learning_rate': (1e-5, 5e-5),
+         'epochs': (1, 3),
+         'batch_size': [2, 4, 8]
+     }
+
+     if st.button("Start Training"):
+         st.write("Initializing Genetic Algorithm...")
+         population = create_population(population_size, param_bounds)
+         best_individual = None
+         best_fitness = float('inf')
+         fitness_history = []
+
+         progress_bar = st.progress(0)
+         status_text = st.empty()
+
+         total_evaluations = num_generations * len(population)
+         current_evaluation = 0
+
+         for generation in range(num_generations):
+             st.write(f"Generation {generation+1}/{num_generations}")
+             fitnesses = []
+             for idx, individual in enumerate(population):
+                 status_text.text(f"Evaluating individual {idx+1}/{len(population)} in generation {generation+1}")
+                 # Clone the model to avoid reusing the same model
+                 model_clone = GPT2LMHeadModel.from_pretrained('gpt2')
+                 model_clone.to('cuda' if torch.cuda.is_available() else 'cpu')
+                 fitness = fitness_function(individual, train_dataset, model_clone, tokenizer)
+                 fitnesses.append(fitness)
+                 if fitness < best_fitness:
+                     best_fitness = fitness
+                     best_individual = individual
+                 current_evaluation += 1
+                 progress_bar.progress(current_evaluation / total_evaluations)
+             fitness_history.append(min(fitnesses))
+             parents = select_mating_pool(population, fitnesses, num_parents)
+             offspring_size = population_size - num_parents
+             offspring = crossover(parents, offspring_size)
+             offspring = mutation(offspring, param_bounds, mutation_rate)
+             population = parents + offspring
+
+         st.write("Training completed!")
+         st.write(f"Best Hyperparameters: {best_individual}")
+         st.write(f"Best Fitness (Loss): {best_fitness}")
+
+         # Plot fitness history
+         st.line_chart(fitness_history)
+
+         # Save the best model
+         if st.button("Save Model"):
+             model_clone.save_pretrained('./fine_tuned_model')
+             tokenizer.save_pretrained('./fine_tuned_model')
+             st.write("Model saved successfully!")
+
+ if __name__ == "__main__":
+     main()
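
The added GA searches over training hyperparameters rather than model weights: each individual is a dict of `learning_rate`, `epochs`, and `batch_size`, and `fitness_function` scores it by the final training loss of a fresh GPT-2 `Trainer` run. The sketch below replays the same select → crossover → mutate cycle with a stand-in fitness so the loop can be sanity-checked without fine-tuning anything; `dummy_fitness` and its formula are illustrative, not part of the commit.

```python
import random
import numpy as np

# Same search space as param_bounds in app.py.
param_bounds = {
    'learning_rate': (1e-5, 5e-5),
    'epochs': (1, 3),
    'batch_size': [2, 4, 8],
}

def dummy_fitness(ind):
    # Illustrative stand-in for fitness_function: pretend lower learning
    # rates and more epochs give lower loss. No model is trained here.
    return ind['learning_rate'] * 1e4 / ind['epochs']

def create_population(size):
    return [{'learning_rate': random.uniform(*param_bounds['learning_rate']),
             'epochs': random.randint(*param_bounds['epochs']),
             'batch_size': random.choice(param_bounds['batch_size'])}
            for _ in range(size)]

population = create_population(6)
for generation in range(3):
    fitnesses = [dummy_fitness(ind) for ind in population]
    # Keep the two lowest-loss individuals (the app's select_mating_pool).
    parents = [population[i] for i in np.argsort(fitnesses)[:2]]
    # Refill the population by uniform crossover of parent genes...
    offspring = [{key: random.choice([parents[0][key], parents[1][key]])
                  for key in param_bounds} for _ in range(len(population) - 2)]
    # ...then mutate each gene with 10% probability, as in mutation().
    for child in offspring:
        if random.random() < 0.1:
            child['learning_rate'] = random.uniform(*param_bounds['learning_rate'])
        if random.random() < 0.1:
            child['epochs'] = random.randint(*param_bounds['epochs'])
        if random.random() < 0.1:
            child['batch_size'] = random.choice(param_bounds['batch_size'])
    population = parents + offspring
    print(f"gen {generation + 1}: best loss {min(fitnesses):.3f}")
```

Because the parents survive into the next generation unchanged (elitism), the per-generation best loss is non-increasing even with random mutation.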
requirements.txt CHANGED
@@ -1,4 +1,10 @@
- streamlit
- numpy
- tensorflow
- scikit-learn
+ streamlit
+ numpy
+ tensorflow
+ scikit-learn
+ transformers
+ torch
+ accelerate
+ datasets
+ tf-keras
+
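
Once the app's "Save Model" button has written a checkpoint, it can be reloaded outside Streamlit with the standard `transformers` loaders. A minimal smoke test, assuming the default `./fine_tuned_model` path from app.py (the prompt is arbitrary, echoing the demo sentences):

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Path matches the save_pretrained() calls in app.py.
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_model')
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_model')
model.eval()

prompt = "Artificial intelligence is transforming"
inputs = tokenizer(prompt, return_tensors='pt')
outputs = model.generate(**inputs, max_new_tokens=30, do_sample=True,
                         pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```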