# import packages
# Register pysqlite3 under the name 'sqlite3' so that any later
# `import sqlite3` resolves to it. This has to run before chromadb is
# imported (presumably because chromadb imports sqlite3 -- confirm against the
# deployment environment before removing).
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

from sentence_transformers import SentenceTransformer
import chromadb
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import faiss


# Embedding vector
class VectorStore:
    """Chroma collection of recipe texts embedded with a sentence-transformer.

    Documents are built by concatenating several text columns of the dataset's
    'train' split, embedded with `multi-qa-MiniLM-L6-cos-v1`, and stored in a
    Chroma collection that can then be queried by semantic similarity.
    """

    # Columns of the 'train' split that are concatenated (in this order) to
    # form the text of each stored document.
    TEXT_COLUMNS = (
        'title_cleaned',
        'recipe_new',
        'meal_type',
        'allergy_type',
        'ingredients_alternatives',
    )

    def __init__(self, collection_name):
        """Load the embedding model and create the Chroma collection.

        Args:
            collection_name: Name of the Chroma collection to create.
        """
        # Initialize the embedding model
        self.embedding_model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
        self.chroma_client = chromadb.Client()
        self.collection = self.chroma_client.create_collection(name=collection_name)

    # Method to populate the vector store with embeddings from a dataset
    def populate_vectors(self, dataset, limit=5000, batch_size=256):
        """Embed the first `limit` training rows and add them to the collection.

        Args:
            dataset: Mapping with a 'train' split that exposes every column
                named in `TEXT_COLUMNS`.
            limit: Maximum number of rows to index (default 5000, kept small
                for the demo). `None` indexes every row.
            batch_size: Number of documents encoded and inserted per call.
                Batching avoids one model call and one database insert per
                document.

        Raises:
            ValueError: If `batch_size` is not a positive integer.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        # Select the text columns to concatenate, truncated to `limit` rows
        columns = [dataset['train'][name][:limit] for name in self.TEXT_COLUMNS]

        # Concatenate the text from all selected columns, one string per row
        texts = [
            f"{tit} {rep} {meal} {alle} {ingr} "
            for tit, rep, meal, alle, ingr in zip(*columns)
        ]

        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            # Encoding a list returns one embedding per input text.
            embeddings = self.embedding_model.encode(batch).tolist()
            self.collection.add(
                embeddings=embeddings,
                documents=batch,
                # ids are the row positions, matching the one-at-a-time scheme
                ids=[str(i) for i in range(start, start + len(batch))],
            )

    # Method to search the ChromaDB collection for relevant context based on a query
    def search_context(self, query, n_results=1):
        """Return the Chroma query result for the documents closest to `query`.

        Args:
            query: Free-text query string.
            n_results: Number of nearest documents to return.

        Returns:
            The result of `collection.query`. Its 'documents' entry holds one
            list of matches per query embedding, so the hits for `query` are
            at `result['documents'][0]`.
        """
        query_embedding = self.embedding_model.encode(query).tolist()
        # Pass an explicit batch of one embedding.
        return self.collection.query(query_embeddings=[query_embedding], n_results=n_results)


# importing dataset hosted on huggingface
# dataset details - https://huggingface.co/datasets/Thefoodprocessor/recipe_new_with_features_full
dataset = load_dataset('Thefoodprocessor/recipe_new_with_features_full')

# create a vector embedding
vector_store = VectorStore("embedding_vector")
vector_store.populate_vectors(dataset)

# Load the model and tokenizer
# text generation model
model_name = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Running transcript fed back to the model as the prompt. Entries alternate
# between a "User/Context/Bot:" prompt and the generated reply.
# NOTE(review): this is module-level state, so every caller of
# chatbot_response shares one transcript -- confirm that is intended.
conversation_history = []

# Number of most recent user turns kept in the prompt, so the transcript
# cannot grow without bound.
MAX_HISTORY_TURNS = 4


# Define the chatbot response function
def chatbot_response(user_input):
    """Generate a reply to `user_input` using retrieved recipe context.

    The closest document in the vector store is looked up, its first 150
    characters are added to the prompt as context, and the model generates a
    continuation of the running transcript.

    Args:
        user_input: The user's message.

    Returns:
        The newly generated text only (the prompt is not echoed back).
    """
    results = vector_store.search_context(user_input, n_results=1)
    # Chroma returns one list of documents per query, so the hits for this
    # single query are at index 0 and the best hit is the first item of that.
    matches = results['documents'][0] if results['documents'] else []
    context = matches[0] if matches else ""

    conversation_history.append(f"User: {user_input}\nContext: {context[:150]}\nBot:")
    # Keep the last MAX_HISTORY_TURNS turns; the odd count keeps the window
    # starting on a "User:" entry and ending on the prompt just added.
    del conversation_history[:-(2 * MAX_HISTORY_TURNS - 1)]

    inputs = tokenizer("\n".join(conversation_history), return_tensors="pt")
    prompt_length = inputs["input_ids"].shape[-1]
    outputs = model.generate(
        **inputs,
        # max_new_tokens budgets only the reply; max_length would also count
        # the prompt and leave no room once the transcript grows.
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
    # The output sequence starts with the prompt tokens; decode only what the
    # model added so the reply is not a copy of the whole transcript.
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    conversation_history.append(response)
    return response


# Gradio interface
def chat(user_input):
    """Gradio callback: return the chatbot's reply to `user_input`."""
    response = chatbot_response(user_input)
    return response


iface = gr.Interface(fn=chat, inputs="text", outputs="text")
iface.launch()