Ankitajadhav committed on
Commit
a72e07a
·
verified ·
1 Parent(s): 91b2664

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -43
app.py CHANGED
@@ -8,21 +8,25 @@ import chromadb
8
  from datasets import load_dataset
9
  import gradio as gr
10
  import torch
 
11
 
 
 
12
 
 
 
 
13
 
14
- # Load model directly
15
- from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
16
- os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
17
- torch.random.manual_seed(0)
18
- model = AutoModelForCausalLM.from_pretrained(
19
- "microsoft/Phi-3-mini-4k-instruct",
20
- low_cpu_mem_usage=True,
21
- torch_dtype="auto",
22
- trust_remote_code=True,
23
- )
24
 
25
- tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
 
 
 
 
 
 
26
 
27
  # Function to clear the cache
28
  def clear_cache(model_name):
@@ -36,7 +40,6 @@ def clear_cache(model_name):
36
  # Embedding vector
37
  class VectorStore:
38
  def __init__(self, collection_name):
39
- # Initialize the embedding model
40
  try:
41
  self.embedding_model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
42
  except Exception as e:
@@ -45,33 +48,24 @@ class VectorStore:
45
  self.chroma_client = chromadb.Client()
46
  self.collection = self.chroma_client.create_collection(name=collection_name)
47
 
48
- # Method to populate the vector store with embeddings from a dataset
49
- def populate_vectors(self, dataset, batch_size=10):
50
- # Use dataset streaming
51
- #dataset = load_dataset('Thefoodprocessor/recipe_new_with_features_full', split='train[:1500]', streaming=True)
52
  dataset = load_dataset('Thefoodprocessor/recipe_new_with_features_full', split='train')
53
- dataset = dataset.select(range(50)) # Select the first 1500 examples
54
 
55
  texts = []
56
- i = 0 # Initialize index
57
  for example in dataset:
58
  title = example['title_cleaned']
59
  recipe = example['recipe_new']
 
60
  allergy = example['allergy_type']
61
- # ingredients_alternative = example['ingredients_alternatives']
62
-
63
- # Concatenate the text from the columns
64
- text = f"{title} {recipe} {allergy}"
65
  texts.append(text)
66
-
67
- # Process the batch
68
  if (i + 1) % batch_size == 0:
69
  self._process_batch(texts, i)
70
  texts = []
71
-
72
- i += 1 # Increment index
73
-
74
- # Process the remaining texts
75
  if texts:
76
  self._process_batch(texts, i)
77
 
@@ -84,47 +78,42 @@ class VectorStore:
84
  query_embeddings = self.embedding_model.encode(query).tolist()
85
  return self.collection.query(query_embeddings=query_embeddings, n_results=n_results)
86
 
87
- # Create a vector embedding
88
  vector_store = VectorStore("embedding_vector")
89
  vector_store.populate_vectors(dataset=None)
90
 
91
- # Fine-tuning function
92
  def fine_tune_model():
93
- # Load your dataset
94
  dataset = load_dataset('Thefoodprocessor/recipe_new_with_features_full', split='train')
95
- dataset = dataset.select(range(50)) # Select the first 1500 examples
96
 
97
- # Prepare the data for training
98
  def tokenize_function(examples):
99
- return tokenizer([" ".join([title, recipe]) for title, recipe in zip(examples['title_cleaned'], examples['recipe_new'])], padding="max_length", truncation=True)
 
 
 
 
100
 
101
- tokenized_datasets = dataset.map(tokenize_function, batched=True)
102
 
103
- # Define training arguments
104
  training_args = TrainingArguments(
105
  output_dir="./results",
106
  evaluation_strategy="epoch",
107
  learning_rate=2e-5,
108
- per_device_train_batch_size=8,
109
- per_device_eval_batch_size=8,
110
  num_train_epochs=3,
111
  weight_decay=0.01,
112
  )
113
 
114
- # Initialize Trainer
115
  trainer = Trainer(
116
  model=model,
117
  args=training_args,
118
  train_dataset=tokenized_datasets,
119
  )
120
 
121
- # Train the model
122
  trainer.train()
123
 
124
- # Fine-tune the model
125
  fine_tune_model()
126
 
127
- # Define the chatbot response function
128
  conversation_history = []
129
 
130
  def chatbot_response(user_input):
@@ -138,7 +127,6 @@ def chatbot_response(user_input):
138
  conversation_history.append(response)
139
  return response
140
 
141
- # Gradio interface
142
  def chat(user_input):
143
  response = chatbot_response(user_input)
144
  return response
 
8
  from datasets import load_dataset
9
  import gradio as gr
10
  import torch
11
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
12
 
13
# Module-level model setup: configure the environment, seed torch, and load
# the Phi-3 model and tokenizer that the rest of the file refers to as
# `model` and `tokenizer`.

# Set environment variables to address warnings.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

# NOTE: the previous revision ran `!pip install accelerate` and
# `!pip install flash-attention` here. The `!` prefix is IPython/Jupyter shell
# syntax and is a SyntaxError in a plain Python file, so the script could not
# even be imported. Declare these packages in the deployment's dependency list
# (e.g. requirements.txt) instead of installing them at import time.
# NOTE(review): the FlashAttention project appears to be published on PyPI as
# `flash-attn`, not `flash-attention` - confirm before adding it.

# Fixed seed so that any torch-side randomness is reproducible across runs.
torch.random.manual_seed(0)
model_name = "microsoft/Phi-3-mini-4k-instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
30
 
31
  # Function to clear the cache
32
  def clear_cache(model_name):
 
40
  # Embedding vector
41
  class VectorStore:
42
  def __init__(self, collection_name):
 
43
  try:
44
  self.embedding_model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
45
  except Exception as e:
 
48
  self.chroma_client = chromadb.Client()
49
  self.collection = self.chroma_client.create_collection(name=collection_name)
50
 
51
def populate_vectors(self, dataset, batch_size=20):
    """Feed recipe texts to ``self._process_batch`` in groups of ``batch_size``.

    Args:
        dataset: Iterable of mapping-like examples exposing the keys
            ``title_cleaned``, ``recipe_new``, ``meal_type``,
            ``allergy_type`` and ``ingredients_alternatives``. When ``None``
            (what the existing caller passes), the first 1500 rows of the
            ``Thefoodprocessor/recipe_new_with_features_full`` train split
            are loaded, exactly as before. Previously this argument was
            ignored and always overwritten by the hard-coded load.
        batch_size: Number of texts handed to ``_process_batch`` per call.

    The second argument given to ``_process_batch`` is unchanged from the
    original loop: the zero-based index of the last example for every full
    batch, and the total example count for the trailing partial batch. What
    ``_process_batch`` does with it is defined elsewhere in the class.
    """
    if dataset is None:
        dataset = load_dataset('Thefoodprocessor/recipe_new_with_features_full', split='train')
        dataset = dataset.select(range(1500))

    texts = []
    seen = 0  # number of examples consumed so far; stays 0 for empty input
    for seen, example in enumerate(dataset, start=1):
        title = example['title_cleaned']
        recipe = example['recipe_new']
        meal_type = example['meal_type']
        allergy = example['allergy_type']
        ingredients_alternative = example['ingredients_alternatives']
        texts.append(f"{title} {recipe} {meal_type} {allergy} {ingredients_alternative}")

        if seen % batch_size == 0:
            # `seen - 1` is the zero-based index of the example just appended.
            self._process_batch(texts, seen - 1)
            texts = []

    # Flush whatever is left over after the last full batch.
    if texts:
        self._process_batch(texts, seen)
71
 
 
78
  query_embeddings = self.embedding_model.encode(query).tolist()
79
  return self.collection.query(query_embeddings=query_embeddings, n_results=n_results)
80
 
 
81
  vector_store = VectorStore("embedding_vector")
82
  vector_store.populate_vectors(dataset=None)
83
 
 
84
  def fine_tune_model():
 
85
  dataset = load_dataset('Thefoodprocessor/recipe_new_with_features_full', split='train')
86
+ dataset = dataset.select(range(1500))
87
 
 
88
  def tokenize_function(examples):
89
+ return tokenizer(
90
+ [" ".join([title, recipe]) for title, recipe in zip(examples['title_cleaned'], examples['recipe_new'])],
91
+ padding="max_length",
92
+ truncation=True
93
+ )
94
 
95
+ tokenized_datasets = dataset.map(tokenize_function, batched=True, batch_size=8)
96
 
 
97
  training_args = TrainingArguments(
98
  output_dir="./results",
99
  evaluation_strategy="epoch",
100
  learning_rate=2e-5,
101
+ per_device_train_batch_size=4,
102
+ per_device_eval_batch_size=4,
103
  num_train_epochs=3,
104
  weight_decay=0.01,
105
  )
106
 
 
107
  trainer = Trainer(
108
  model=model,
109
  args=training_args,
110
  train_dataset=tokenized_datasets,
111
  )
112
 
 
113
  trainer.train()
114
 
 
115
  fine_tune_model()
116
 
 
117
  conversation_history = []
118
 
119
  def chatbot_response(user_input):
 
127
  conversation_history.append(response)
128
  return response
129
 
 
130
  def chat(user_input):
131
  response = chatbot_response(user_input)
132
  return response