Aniruddha21 committed
Commit e7947f0
1 Parent(s): 8751bf3

Delete fine_tuned_llama_2_for_comment_analysis.py

fine_tuned_llama_2_for_comment_analysis.py DELETED
@@ -1,508 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """Fine Tuned Llama 2 for Comment Analysis
3
-
4
- Automatically generated by Colaboratory.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1NX5z-wVpsEp8UigB0q7vZSZMFRa6nnEE
8
-
9
- ##**Extract YouTube Comments**
10
- """
11
-
12
- # !pip uninstall gradio
13
- # !pip3 install gradio -q
14
- # !pip install --upgrade fastapi -q
15
- # !pip install typing-extensions --upgrade
16
-
17
- # import locale
18
- # locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
19
-
20
- # import locale
21
- # locale.getpreferredencoding = lambda: "UTF-8"
22
-
23
- # !pip3 install typing-extensions==4.2.0
24
- # !pip3 install gradio -q
25
- # !pip3 install --upgrade tensorflow
26
-
27
- import pandas as pd
28
- import gradio as gr
29
- from googleapiclient.discovery import build
30
- import csv
31
- # import gradio as gr
32
- from PIL import Image
33
- import io
34
-
35
- api_key = 'AIzaSyANfQYiumNUfJ8_YaDg-Hfr0BRXFhXnbvQ'
36
-
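The hardcoded key above can instead be read from the environment. A minimal sketch, assuming an illustrative variable name YOUTUBE_API_KEY that this script does not otherwise define:

import os

# Illustrative only: prefer an environment variable over a hardcoded key,
# falling back to the value assigned above if the variable is unset.
api_key = os.environ.get("YOUTUBE_API_KEY", api_key)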
37
- def video_comments(video_id):
38
- # Create a CSV file to store comments
39
- with open('comments.csv', 'w', newline='', encoding='utf-8') as csvfile:
40
- fieldnames = ['Comment']
41
- writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
42
- writer.writeheader()
43
-
44
- # Counter to limit the number of comments
45
- comment_count = 0
46
-
47
- # Create the YouTube Data API client
48
- youtube = build('youtube', 'v3', developerKey=api_key)
49
-
50
- # Retrieve the first page of comment threads for the video
51
- video_response = youtube.commentThreads().list(
52
- part='snippet,replies',
53
- videoId=video_id,
54
- maxResults=100 # Adjust the number of comments per page as needed
55
- ).execute()
56
-
57
- # Iterate over pages of the comment response
58
- while video_response:
59
-
60
- # extracting required info from each result object
61
- for item in video_response['items']:
62
-
63
- # Extracting comments
64
- comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
65
-
66
- # Write the comment to the CSV file
67
- writer.writerow({'Comment': comment})
68
-
69
- comment_count += 1
70
-
71
- # Check if the maximum comment count is reached
72
- if comment_count >= 50:
73
- return
74
-
75
- # Fetch the next page of comments, if any
76
- if 'nextPageToken' in video_response:
77
- video_response = youtube.commentThreads().list(
78
- part='snippet,replies',
79
- videoId=video_id,
80
- pageToken=video_response['nextPageToken'],
81
- maxResults=100 # Adjust the number of comments per page as needed
82
- ).execute()
83
- else:
84
- break
85
-
86
- def execution_function(video_id):
87
- # Initialize a counter for deleted rows
88
- deleted_row_count = 0
89
-
90
- video_comments(video_id)
91
-
92
- # Load the comments file created above
93
- file_path = "/content/comments.csv"
94
- df = pd.read_csv(file_path)
95
-
96
- # Rename the column name to 'comments'
97
- df.rename(columns={'Comment': 'comments'}, inplace=True)
98
-
99
- # Keep only the first 10 comments for quick analysis
100
- df = df.head(10)
101
-
102
- return df
103
- # return_distribution()
104
-
105
- # comments_df = execution_function("6ydFDwv-n8w")
106
- # comments_df = comments_df.head(20)
107
-
108
- # comments_df.head()
109
-
110
- """##**Fine - tune Llama 2**
111
-
112
- IMP: This notebook runs on a T4 GPU.
113
- """
114
-
115
- # !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
116
-
117
- import os
118
- import torch
119
- from datasets import load_dataset
120
- from transformers import (
121
- AutoModelForCausalLM,
122
- AutoTokenizer,
123
- BitsAndBytesConfig,
124
- HfArgumentParser,
125
- TrainingArguments,
126
- pipeline,
127
- logging,
128
- )
129
- from peft import LoraConfig, PeftModel
130
- from trl import SFTTrainer
131
-
132
- # The model that you want to train from the Hugging Face hub
133
- model_name = "NousResearch/Llama-2-7b-chat-hf"
134
-
135
- # The instruction dataset to use
136
- # dataset_name = "mlabonne/guanaco-llama2-1k"
137
-
138
- # Fine-tuned model name
139
- # new_model = "llama-2-7b-miniguanaco"
140
-
141
- ################################################################################
142
- # QLoRA parameters
143
- ################################################################################
144
-
145
- # LoRA attention dimension
146
- lora_r = 64
147
-
148
- # Alpha parameter for LoRA scaling
149
- lora_alpha = 16
150
-
151
- # Dropout probability for LoRA layers
152
- lora_dropout = 0.1
153
-
154
- ################################################################################
155
- # bitsandbytes parameters
156
- ################################################################################
157
-
158
- # Activate 4-bit precision base model loading
159
- use_4bit = True
160
-
161
- # Compute dtype for 4-bit base models
162
- bnb_4bit_compute_dtype = "float16"
163
-
164
- # Quantization type (fp4 or nf4)
165
- bnb_4bit_quant_type = "nf4"
166
-
167
- # Activate nested quantization for 4-bit base models (double quantization)
168
- use_nested_quant = False
169
-
170
- ################################################################################
171
- # TrainingArguments parameters
172
- ################################################################################
173
-
174
- # Output directory where the model predictions and checkpoints will be stored
175
- output_dir = "./results"
176
-
177
- # Number of training epochs
178
- num_train_epochs = 1
179
-
180
- # Enable fp16/bf16 training (set bf16 to True with an A100)
181
- fp16 = False
182
- bf16 = False
183
-
184
- # Batch size per GPU for training
185
- per_device_train_batch_size = 4
186
-
187
- # Batch size per GPU for evaluation
188
- per_device_eval_batch_size = 4
189
-
190
- # Number of update steps to accumulate the gradients for
191
- gradient_accumulation_steps = 1
192
-
193
- # Enable gradient checkpointing
194
- gradient_checkpointing = True
195
-
196
- # Maximum gradient norm (gradient clipping)
197
- max_grad_norm = 0.3
198
-
199
- # Initial learning rate (AdamW optimizer)
200
- learning_rate = 2e-4
201
-
202
- # Weight decay to apply to all layers except bias/LayerNorm weights
203
- weight_decay = 0.001
204
-
205
- # Optimizer to use
206
- optim = "paged_adamw_32bit"
207
-
208
- # Learning rate schedule
209
- lr_scheduler_type = "cosine"
210
-
211
- # Number of training steps (overrides num_train_epochs)
212
- max_steps = -1
213
-
214
- # Ratio of steps for a linear warmup (from 0 to learning rate)
215
- warmup_ratio = 0.03
216
-
217
- # Group sequences into batches with the same length
218
- # Saves memory and speeds up training considerably
219
- group_by_length = True
220
-
221
- # Save a checkpoint every X update steps
222
- save_steps = 0
223
-
224
- # Log every X update steps
225
- logging_steps = 25
226
-
227
- ################################################################################
228
- # SFT parameters
229
- ################################################################################
230
-
231
- # Maximum sequence length to use
232
- max_seq_length = None
233
-
234
- # Pack multiple short examples in the same input sequence to increase efficiency
235
- packing = False
236
-
237
- # Load the entire model on GPU 0
238
- device_map = {"": 0}
239
-
240
- # Load dataset (you can process it here)
241
- # dataset = load_dataset(dataset_name, split="train")
242
-
243
- # Load tokenizer and model with QLoRA configuration
244
- compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
245
-
246
- bnb_config = BitsAndBytesConfig(
247
- load_in_4bit=use_4bit,
248
- bnb_4bit_quant_type=bnb_4bit_quant_type,
249
- bnb_4bit_compute_dtype=compute_dtype,
250
- bnb_4bit_use_double_quant=use_nested_quant,
251
- )
252
-
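For a rough sense of why this fits the T4 mentioned in the docstring: a 7B-parameter model stored as 4-bit NF4 weights takes about 7e9 × 0.5 bytes ≈ 3.5 GB, versus roughly 14 GB in fp16, leaving room on a 16 GB T4 for activations and the LoRA adapter weights (these figures are approximate and ignore quantization constants and optimizer state).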
253
- # Check GPU compatibility with bfloat16
254
- if compute_dtype == torch.float16 and use_4bit:
255
- major, _ = torch.cuda.get_device_capability()
256
- if major >= 8:
257
- print("=" * 80)
258
- print("Your GPU supports bfloat16: accelerate training with bf16=True")
259
- print("=" * 80)
260
-
261
- # Load base model
262
- model = AutoModelForCausalLM.from_pretrained(
263
- model_name,
264
- quantization_config=bnb_config,
265
- device_map=device_map
266
- )
267
- model.config.use_cache = False
268
- model.config.pretraining_tp = 1
269
-
270
- # Load LLaMA tokenizer
271
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
272
- tokenizer.pad_token = tokenizer.eos_token
273
- tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
274
-
275
- # Load LoRA configuration
276
- peft_config = LoraConfig(
277
- lora_alpha=lora_alpha,
278
- lora_dropout=lora_dropout,
279
- r=lora_r,
280
- bias="none",
281
- task_type="CAUSAL_LM",
282
- )
283
-
284
- # Set training parameters
285
- training_arguments = TrainingArguments(
286
- output_dir=output_dir,
287
- num_train_epochs=num_train_epochs,
288
- per_device_train_batch_size=per_device_train_batch_size,
289
- gradient_accumulation_steps=gradient_accumulation_steps,
290
- optim=optim,
291
- save_steps=save_steps,
292
- logging_steps=logging_steps,
293
- learning_rate=learning_rate,
294
- weight_decay=weight_decay,
295
- fp16=fp16,
296
- bf16=bf16,
297
- max_grad_norm=max_grad_norm,
298
- max_steps=max_steps,
299
- warmup_ratio=warmup_ratio,
300
- group_by_length=group_by_length,
301
- lr_scheduler_type=lr_scheduler_type,
302
- report_to="tensorboard"
303
- )
304
-
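The script defines all the QLoRA, LoRA and SFT parameters above but never actually runs a training step (the dataset loading is commented out). A minimal sketch of how these objects are typically wired together with trl 0.4.7, assuming the commented-out guanaco dataset, which stores its prompts in a "text" column:

# Sketch only; not part of the original file. load_dataset and SFTTrainer are imported above.
dataset = load_dataset("mlabonne/guanaco-llama2-1k", split="train")

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)
trainer.train()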
305
- def extract_between_inst_and_newline(text):
306
- start_tag = "[/INST]"
307
- end_char = "\n"
308
-
309
- start_index = text.find(start_tag)
310
-
311
- if start_index != -1:
312
- end_index = text.find(end_char, start_index)
313
- if end_index != -1:
314
- extracted_text = text[start_index + len(start_tag):end_index]
315
- return extracted_text.strip()
316
-
317
- return None
318
-
319
- import re
320
- from functools import lru_cache
321
-
322
- @lru_cache
323
- def extract_classification_and_remark(output):
324
- classification_match = re.search(r'Classification: (.*?)\n', output)
325
- remark_match = re.search(r'Remark: (.*?)$', output)
326
-
327
- classification = classification_match.group(1) if classification_match else None
328
- remark = remark_match.group(1) if remark_match else None
329
-
330
- return classification, remark
331
-
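A quick illustration of what the parser above expects and returns (the reply string is invented, not real model output):

example_reply = "Classification: happy\nRemark: The comment praises the video."
print(extract_classification_and_remark(example_reply))
# -> ('happy', 'The comment praises the video.')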
332
- # Ignore warnings
333
- logging.set_verbosity(logging.CRITICAL)
334
-
335
- # Run a text-generation pipeline with the model loaded above
336
- prompt = '''Can you classify the human input as either happy, sad, angry, surprised, confused or neutral and tell me why it was classified as such in one short sentence.
337
- Don't reply with anything besides the classification and the remark. Separate the classification and remark with :
338
- Human input: {}'''
339
-
340
- def process_comment(comment):
341
- formatted_prompt = prompt.format(comment)
342
- pipe = pipeline(task="text2text-generation", model=model, tokenizer=tokenizer, max_length=150)
343
- result = pipe(f"<s>[INST] {formatted_prompt} [/INST]")
344
- extract_output = result[0]['generated_text']
345
- classification, remark = extract_classification_and_remark(extract_output)
346
- return comment, classification, remark
347
-
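process_comment rebuilds the generation pipeline on every call, which is costly once it is mapped over many comments below. One possible variant, a sketch with hypothetical names (shared_pipe, process_comment_fast), builds the pipeline once and reuses it:

# Build the pipeline a single time and reuse it for every comment.
shared_pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=150)

def process_comment_fast(comment):
    formatted_prompt = prompt.format(comment)
    result = shared_pipe(f"<s>[INST] {formatted_prompt} [/INST]")
    classification, remark = extract_classification_and_remark(result[0]['generated_text'])
    return comment, classification, remark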
348
- import matplotlib.pyplot as plt
349
- import seaborn as sns
350
-
351
- def return_distribution(new_formatted_df):
352
- # Count how many comments fall into each sentiment class
353
- sentiment_counts = new_formatted_df['classification'].value_counts()
354
- fig = plt.figure()
355
- sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values)
356
- plt.xlabel('Sentiment')
357
- plt.ylabel('Count')
358
- plt.title('Sentiment Distribution')
359
- return fig
360
-
361
- from wordcloud import WordCloud
362
-
363
- def return_highest_sentiment_worldcloud(new_formatted_df, sentiment):
364
- # Create a word cloud for a specific sentiment, e.g., 'happy'
365
- sentiment_comments = new_formatted_df[new_formatted_df['classification'] == sentiment]['comments']
366
- wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(sentiment_comments))
367
- fig = plt.figure(figsize=(10, 5))
368
- plt.imshow(wordcloud, interpolation='bilinear')
369
- plt.axis('off')
370
- plt.title(f'Word Cloud for {sentiment} Comments')
371
- return fig
372
-
373
- import pandas as pd
374
-
375
- def concatenate_remarks_based_on_classification(dataset):
376
-
377
- # Create an empty dictionary to store concatenated remarks for each classification type.
378
- concatenated_remarks = {}
379
-
380
- # Iterate through the dataset to concatenate remarks.
381
- for index, row in dataset.iterrows():
382
- classification = row['classification']
383
- remarks = row['remark']
384
-
385
- # Check if the classification exists in the dictionary.
386
- if classification in concatenated_remarks:
387
- if remarks is not None:
388
- concatenated_remarks[classification] += ' ' + str(remarks)
389
- else:
390
- if remarks is not None:
391
- concatenated_remarks[classification] = str(remarks)
392
-
393
- # Create a new DataFrame with the concatenated remarks.
394
- concatenated_remarks_df = pd.DataFrame(list(concatenated_remarks.items()), columns=['classification', 'concatenated_remarks'])
395
-
396
- return concatenated_remarks_df
397
-
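The loop above can also be written as a pandas groupby; a brief equivalent sketch (it drops rows with a missing remark instead of checking for None):

def concatenate_remarks_groupby(dataset):
    # Join all remarks per classification into one string each.
    grouped = (
        dataset.dropna(subset=['remark'])
               .groupby('classification')['remark']
               .apply(lambda s: ' '.join(s.astype(str)))
               .reset_index(name='concatenated_remarks')
    )
    return grouped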
398
- # !pip install dask -q
399
-
400
- # Prompt for summarizing the concatenated remarks with the same text-generation pipeline
401
- prompt1 = '''Can you summarize the following text in a paragraph of no more than 100 words. Don't respond with anything besides the summary.
402
- Human input: {}'''
403
-
404
- def summarize_text(comment):
405
- formatted_prompt = prompt1.format(comment)
406
- new_pipe = pipeline(task="text2text-generation", model=model, tokenizer=tokenizer, max_length=3000)
407
- new_result = new_pipe(f"<s>[INST] {formatted_prompt} [/INST]")
408
- return new_result
409
-
410
- ## Function for first tab
411
-
412
- import numpy as np
413
- from concurrent.futures import ThreadPoolExecutor
414
- import dask.dataframe as dd
415
- from dask.distributed import Client, LocalCluster
416
- # from multiprocessing import Pool
417
- # num_processes = 4
418
-
419
-
420
- # Import necessary libraries and functions here
421
- # return_df = pd.DataFrame()
422
- # final_analysed_df = pd.DataFrame() # Initialize as None at the global scope
423
-
424
- # Define a Gradio interface
425
- def sentiment_distribution_interface(video_id):
426
- # global final_analysed_df
427
- # global unique_classifications
428
-
429
-
430
- return_df = pd.DataFrame()
431
- # Call the execution function with the video_id
432
- return_df = execution_function(video_id)
433
- print(return_df.head())
434
-
435
- from concurrent.futures import ThreadPoolExecutor
436
-
437
- def process_row(row):  # ~3.9 s per row
438
- comment, classification, remark = process_comment(row.comments)
439
- return comment, classification, remark
440
-
441
- with ThreadPoolExecutor(max_workers=4) as executor: # Adjust the number of workers as needed
442
- results = list(executor.map(process_row, return_df.itertuples()))
443
-
444
- print(type(results))
445
- print(results)
446
-
447
- print("__________________________________________________________________")
448
-
449
- comments, classification, remark = zip(*results)
450
-
451
- # Create a DataFrame from the separated data
452
- df = pd.DataFrame({'comments': comments, 'classification': classification, 'remark': remark})
453
-
454
- print(df.head())
455
-
456
- print("__________________________________________________________________")
457
-
458
- plot = return_distribution(df) # Modify this line to capture the plot
459
-
460
- word_cloud = return_highest_sentiment_worldcloud(df, df['classification'].value_counts().idxmax())
461
-
462
- df.to_csv('processed_comments.csv', index=False) # index=False prevents writing the row numbers as a column
463
-
464
- # Concatenating remarks for different sentiments
465
- # concatenated_remarks_df = concatenate_remarks_based_on_classification(df)
466
- # print(concatenated_remarks_df)
467
-
468
- # final_analysed_df = df
469
-
470
- return plot , word_cloud # Return the plot
471
-
472
- # Function for Second Tab
473
-
474
- def function_for_second_tab(input_val):
475
-
476
- final_analysed_df = pd.read_csv('processed_comments.csv')
477
- final_analysed_df = pd.DataFrame(final_analysed_df)
478
- print(final_analysed_df.head())
479
-
480
- word_cloud = return_highest_sentiment_worldcloud(final_analysed_df, input_val)
481
-
482
- concatenated_remarks_df = concatenate_remarks_based_on_classification(final_analysed_df)
483
-
484
- comments = concatenated_remarks_df.loc[concatenated_remarks_df['classification'] == input_val, 'concatenated_remarks'].values[0]
485
-
486
- summarized_text = summarize_text(comments)
487
-
488
- extract_output_summary = summarized_text[0]['generated_text']
489
-
490
- final_extract = extract_output_summary.split('[/INST]')[1].strip()
491
-
492
- return word_cloud, final_extract
493
-
494
- # Define the first tab
495
- outputs = [gr.Plot(), gr.Plot()]
496
- iface = gr.Interface(fn=sentiment_distribution_interface, inputs="text", outputs=outputs)
497
-
498
-
499
- # Define the second tab
500
- output_second_tab = [gr.Plot(), "text"]
501
- inputs = "text"
502
-
503
- description = ("Enter the sentiment for which you want a detailed report")
504
- app2 = gr.Interface(fn=function_for_second_tab, inputs=inputs, outputs=output_second_tab, description=description)
505
-
506
- # launch the app
507
- demo = gr.TabbedInterface([iface, app2], ["Welcome page", "Visualization page"])
508
- demo.queue().launch()