from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, pipeline
import gradio as gr
import logging

# Enable detailed logging
logging.basicConfig(level=logging.INFO)
# Load dataset
dataset = load_dataset("mwitiderrick/swahili")

# Print dataset columns for verification
print(f"Dataset columns: {dataset['train'].column_names}")

# Select a subset of the dataset (e.g., the first 50,000 rows)
subset_size = 50000  # Adjust the size as needed
subset_dataset = dataset["train"].select(range(min(subset_size, len(dataset["train"]))))
print(f"Using a subset of {len(subset_dataset)} rows for training.")
# Initialize the tokenizer and model
model_name = "gpt2"  # Use GPT-2 for text generation
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add a padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token
tokenizer.pad_token_id = tokenizer.eos_token_id
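# Note: GPT-2 ships without a native pad token, so reusing eos_token is the
# usual workaround. Because pad == eos here, padded positions also contribute
# to the loss; masking them to -100 in the labels would be a further refinement.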
# Preprocess the dataset
def preprocess_function(examples):
    # Tokenize and pad each example to a fixed length
    encodings = tokenizer(
        examples['text'],  # The dataset stores raw text in a 'text' column
        truncation=True,
        padding='max_length',  # Ensure consistent length
        max_length=512
    )
    # For causal LM fine-tuning the labels are the input_ids themselves;
    # GPT2LMHeadModel shifts them internally when computing the loss.
    encodings['labels'] = encodings['input_ids'].copy()
    return encodings
# Tokenize the dataset
try:
    tokenized_datasets = subset_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=subset_dataset.column_names  # Drop raw columns the model can't consume
    )
except Exception as e:
    print(f"Error during tokenization: {e}")
    raise  # Without a tokenized dataset there is nothing to train on
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_dir='./logs',
    logging_steps=500,  # Log every 500 steps
    evaluation_strategy="no",  # No eval_dataset is provided, so disable evaluation
    save_steps=10_000,  # Save a checkpoint every 10,000 steps
    save_total_limit=2,  # Keep only the last 2 checkpoints
)
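# Note: newer transformers releases rename this argument to eval_strategy;
# keep whichever spelling matches the pinned library version.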
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)
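# Note: recent transformers versions deprecate Trainer's tokenizer argument in
# favor of processing_class=tokenizer; swap the keyword if you see a warning.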
# Start training
try:
    trainer.train()
except Exception as e:
    print(f"Error during training: {e}")
# Define the Gradio interface function | |
nlp = pipeline("text-generation", model=model, tokenizer=tokenizer) | |
def generate_text(prompt): | |
try: | |
return nlp(prompt, max_length=50)[0]['generated_text'] | |
except Exception as e: | |
return f"Error during text generation: {e}" | |
# Create and launch the Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs="text",
    outputs="text",
    title="Swahili Language Model",
    description="Generate text in Swahili using a GPT-2 model fine-tuned on Swahili text."
)
iface.launch()