"""Recipe chatbot.

Builds an in-memory Chroma vector store from a recipe dataset, fine-tunes a
GPT-2 checkpoint on the same recipes, and serves a Gradio text interface that
answers using the retrieved recipe as context.
"""

import shutil
import os
from itertools import islice

# Replace the stdlib sqlite3 module with pysqlite3. This must run before
# chromadb is imported (presumably because chromadb needs a newer SQLite than
# the host provides -- confirm against the deployment environment).
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

from sentence_transformers import SentenceTransformer
import chromadb
from datasets import load_dataset
import gradio as gr
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

DATASET_NAME = 'Thefoodprocessor/recipe_new_with_features_full'
DATASET_LIMIT = 1500  # number of recipes used for both retrieval and training
MAX_NEW_TOKENS = 150  # tokens generated per chatbot reply

model_name = "Amitesh007/text_generation-finetuned-gpt2"

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 ships without a padding token; reuse EOS so padding="max_length" works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model with from_tf=True
model = GPT2LMHeadModel.from_pretrained(model_name, from_tf=True)


def clear_cache(model_name):
    """Delete the local sentence-transformers cache directory for *model_name*.

    The directory is ``~/.cache/torch/sentence_transformers/<name>`` where
    ``/`` in the model name is replaced by ``_``. Prints what happened.
    """
    cache_dir = os.path.expanduser(f'~/.cache/torch/sentence_transformers/{model_name.replace("/", "_")}')
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)
        print(f"Cleared cache directory: {cache_dir}")
    else:
        print(f"No cache directory found for: {cache_dir}")


class VectorStore:
    """Chroma collection paired with a sentence-embedding model."""

    def __init__(self, collection_name):
        """Load the embedding model and create a collection named *collection_name*.

        Any exception raised while loading the embedding model is printed
        and re-raised.
        """
        try:
            self.embedding_model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

        self.chroma_client = chromadb.Client()
        self.collection = self.chroma_client.create_collection(name=collection_name)

    def populate_vectors(self, dataset, batch_size=100):
        """Embed recipes from *dataset* and add them to the collection.

        Args:
            dataset: Iterable of examples carrying the keys ``title_cleaned``,
                ``recipe_new``, ``meal_type``, ``allergy_type`` and
                ``ingredients_alternatives``. When ``None``, the first
                ``DATASET_LIMIT`` examples of ``DATASET_NAME`` are streamed.
            batch_size: Number of texts encoded and inserted per call. Each
                batch is encoded and added in one go, so keep it modest.

        Raises:
            ValueError: If *batch_size* is less than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        if dataset is None:
            # Split slicing ('train[:1500]') is not available in streaming
            # mode, so stream the full split and cap the iteration instead.
            stream = load_dataset(DATASET_NAME, split='train', streaming=True)
            dataset = islice(stream, DATASET_LIMIT)

        texts = []
        batch_start = 0  # ID of the first text in the pending batch
        for example in dataset:
            title = example['title_cleaned']
            recipe = example['recipe_new']
            meal_type = example['meal_type']
            allergy = example['allergy_type']
            ingredients_alternative = example['ingredients_alternatives']

            # Concatenate the text from the columns
            texts.append(f"{title} {recipe} {meal_type} {allergy} {ingredients_alternative}")

            if len(texts) == batch_size:
                self._process_batch(texts, batch_start)
                batch_start += len(texts)
                texts = []

        # Process the remaining texts
        if texts:
            self._process_batch(texts, batch_start)

    def _process_batch(self, texts, batch_start_idx):
        """Encode *texts* and add them with IDs ``batch_start_idx``, ``+1``, ..."""
        embeddings = self.embedding_model.encode(texts, batch_size=len(texts)).tolist()
        ids = [str(batch_start_idx + offset) for offset in range(len(texts))]
        # One add call per batch instead of one per document.
        self.collection.add(embeddings=embeddings, documents=list(texts), ids=ids)

    def search_context(self, query, n_results=1):
        """Return the collection's query result for the *n_results* nearest documents."""
        query_embeddings = self.embedding_model.encode([query]).tolist()
        return self.collection.query(query_embeddings=query_embeddings, n_results=n_results)


# Create a vector embedding
vector_store = VectorStore("embedding_vector")
vector_store.populate_vectors(dataset=None)


def fine_tune_model():
    """Fine-tune the module-level ``model`` on recipe titles and bodies.

    Trains for three epochs as a causal language model, evaluating once per
    epoch on a 10% held-out split, then switches the model back to eval mode
    so it is ready for generation.
    """
    # Load your dataset
    dataset = load_dataset(DATASET_NAME, split=f'train[:{DATASET_LIMIT}]', streaming=False)

    def tokenize_function(examples):
        # With batched=True every column is a list, so build one text per row.
        texts = [
            title + " " + recipe
            for title, recipe in zip(examples['title_cleaned'], examples['recipe_new'])
        ]
        return tokenizer(texts, padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names,
    )

    # evaluation_strategy="epoch" needs an eval set; hold out part of the data.
    splits = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

    # Causal LM training needs labels; this collator copies input_ids into
    # labels (mlm=False) so the model can return a loss.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=splits["train"],
        eval_dataset=splits["test"],
        data_collator=data_collator,
    )

    # Train the model
    trainer.train()

    # Leave training mode so dropout is disabled during generation.
    model.eval()


# Fine-tune the model
fine_tune_model()

# Running transcript of the conversation, one entry per user turn.
conversation_history = []


def chatbot_response(user_input):
    """Generate a reply to *user_input* using the best-matching recipe as context.

    The turn (user text, up to 150 characters of retrieved context, and the
    generated reply) is recorded in ``conversation_history``. Only the newly
    generated text is returned.
    """
    results = vector_store.search_context(user_input, n_results=1)

    # 'documents' holds one list of hits per query; take the top hit's text.
    documents = results['documents']
    hits = documents[0] if documents else []
    context = hits[0] if hits else ""

    conversation_history.append(f"User: {user_input}\nContext: {context[:150]}\nBot:")
    prompt = "\n".join(conversation_history)

    inputs = tokenizer(prompt, return_tensors="pt")
    # Keep only the newest tokens that still leave room for the reply inside
    # the model's position limit.
    max_prompt_tokens = model.config.n_positions - MAX_NEW_TOKENS
    input_ids = inputs["input_ids"][:, -max_prompt_tokens:].to(model.device)
    attention_mask = inputs["attention_mask"][:, -max_prompt_tokens:].to(model.device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The output starts with the prompt; decode only what was generated.
    new_tokens = outputs[0][input_ids.shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    conversation_history[-1] += f" {response}"
    return response


# Gradio interface
def chat(user_input):
    """Gradio callback: return the chatbot's reply to *user_input*."""
    response = chatbot_response(user_input)
    return response


css = ".gradio-container {background: url(https://upload.wikimedia.org/wikipedia/commons/f/f5/Spring_Kitchen_Line-Up_%28Unsplash%29.jpg)}"
iface = gr.Interface(fn=chat, inputs="text", outputs="text", css=css)
iface.launch()