Aniruddha21 committed
Commit e7947f0
1 Parent(s): 8751bf3

Delete fine_tuned_llama_2_for_comment_analysis.py

fine_tuned_llama_2_for_comment_analysis.py DELETED
@@ -1,508 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """Fine Tuned Llama 2 for Comment Analysis
3
-
4
- Automatically generated by Colaboratory.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1NX5z-wVpsEp8UigB0q7vZSZMFRa6nnEE
8
-
9
- ##**Extract YouTube Comments**
10
- """
11
-
12
- # !pip uninstall gradio
13
- # !pip3 install gradio -q
14
- # !pip install --upgrade fastapi -q
15
- # !pip install typing-extensions --upgrade
16
-
17
- # import locale
18
- # locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
19
-
20
- # import locale
21
- # locale.getpreferredencoding = lambda: "UTF-8"
22
-
23
- # !pip3 install typing-extensions==4.2.0
24
- # !pip3 install gradio -q
25
- # !pip3 install --upgrade tensorflow
26
-
27
- import pandas as pd
28
- import gradio as gr
29
- from googleapiclient.discovery import build
30
- import csv
31
- # import gradio as gr
32
- from PIL import Image
33
- import io
34
-
35
- api_key = 'AIzaSyANfQYiumNUfJ8_YaDg-Hfr0BRXFhXnbvQ'
36
-
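The hardcoded key above can instead be read from the environment. A minimal sketch, assuming an illustrative variable name YOUTUBE_API_KEY that this script does not otherwise define:

import os

# Illustrative only: prefer an environment variable over a hardcoded key,
# falling back to the value assigned above if the variable is unset.
api_key = os.environ.get("YOUTUBE_API_KEY", api_key)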
37
- def video_comments(video_id):
38
- # Create a CSV file to store comments
39
- with open('comments.csv', 'w', newline='', encoding='utf-8') as csvfile:
40
- fieldnames = ['Comment']
41
- writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
42
- writer.writeheader()
43
-
44
- # Counter to limit the number of comments
45
- comment_count = 0
46
-
47
- # Create the YouTube Data API client
48
- youtube = build('youtube', 'v3', developerKey=api_key)
49
-
50
- # Retrieve the first page of comment threads for the video
51
- video_response = youtube.commentThreads().list(
52
- part='snippet,replies',
53
- videoId=video_id,
54
- maxResults=100 # Adjust the number of comments per page as needed
55
- ).execute()
56
-
57
- # Iterate over pages of the comment response
58
- while video_response:
59
-
60
- # extracting required info from each result object
61
- for item in video_response['items']:
62
-
63
- # Extracting comments
64
- comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
65
-
66
- # Write the comment to the CSV file
67
- writer.writerow({'Comment': comment})
68
-
69
- comment_count += 1
70
-
71
- # Check if the maximum comment count is reached
72
- if comment_count >= 50:
73
- return
74
-
75
- # Fetch the next page of comments, if any
76
- if 'nextPageToken' in video_response:
77
- video_response = youtube.commentThreads().list(
78
- part='snippet,replies',
79
- videoId=video_id,
80
- pageToken=video_response['nextPageToken'],
81
- maxResults=100 # Adjust the number of comments per page as needed
82
- ).execute()
83
- else:
84
- break
85
-
86
- def execution_function(video_id):
87
- # Initialize a counter for deleted rows
88
- deleted_row_count = 0
89
-
90
- video_comments(video_id)
91
-
92
- # Load the comments file created above
93
- file_path = "/content/comments.csv"
94
- df = pd.read_csv(file_path)
95
-
96
- # Rename the column name to 'comments'
97
- df.rename(columns={'Comment': 'comments'}, inplace=True)
98
-
99
- # Keep only the first 10 comments for quick analysis
100
- df = df.head(10)
101
-
102
- return df
103
- # return_distribution()
104
-
105
- # comments_df = execution_function("6ydFDwv-n8w")
106
- # comments_df = comments_df.head(20)
107
-
108
- # comments_df.head()
109
-
110
- """##**Fine - tune Llama 2**
111
-
112
- IMP: This notebook runs on a T4 GPU.
113
- """
114
-
115
- # !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
116
-
117
- import os
118
- import torch
119
- from datasets import load_dataset
120
- from transformers import (
121
- AutoModelForCausalLM,
122
- AutoTokenizer,
123
- BitsAndBytesConfig,
124
- HfArgumentParser,
125
- TrainingArguments,
126
- pipeline,
127
- logging,
128
- )
129
- from peft import LoraConfig, PeftModel
130
- from trl import SFTTrainer
131
-
132
- # The model that you want to train from the Hugging Face hub
133
- model_name = "NousResearch/Llama-2-7b-chat-hf"
134
-
135
- # The instruction dataset to use
136
- # dataset_name = "mlabonne/guanaco-llama2-1k"
137
-
138
- # Fine-tuned model name
139
- # new_model = "llama-2-7b-miniguanaco"
140
-
141
- ################################################################################
142
- # QLoRA parameters
143
- ################################################################################
144
-
145
- # LoRA attention dimension
146
- lora_r = 64
147
-
148
- # Alpha parameter for LoRA scaling
149
- lora_alpha = 16
150
-
151
- # Dropout probability for LoRA layers
152
- lora_dropout = 0.1
153
-
154
- ################################################################################
155
- # bitsandbytes parameters
156
- ################################################################################
157
-
158
- # Activate 4-bit precision base model loading
159
- use_4bit = True
160
-
161
- # Compute dtype for 4-bit base models
162
- bnb_4bit_compute_dtype = "float16"
163
-
164
- # Quantization type (fp4 or nf4)
165
- bnb_4bit_quant_type = "nf4"
166
-
167
- # Activate nested quantization for 4-bit base models (double quantization)
168
- use_nested_quant = False
169
-
170
- ################################################################################
171
- # TrainingArguments parameters
172
- ################################################################################
173
-
174
- # Output directory where the model predictions and checkpoints will be stored
175
- output_dir = "./results"
176
-
177
- # Number of training epochs
178
- num_train_epochs = 1
179
-
180
- # Enable fp16/bf16 training (set bf16 to True with an A100)
181
- fp16 = False
182
- bf16 = False
183
-
184
- # Batch size per GPU for training
185
- per_device_train_batch_size = 4
186
-
187
- # Batch size per GPU for evaluation
188
- per_device_eval_batch_size = 4
189
-
190
- # Number of update steps to accumulate the gradients for
191
- gradient_accumulation_steps = 1
192
-
193
- # Enable gradient checkpointing
194
- gradient_checkpointing = True
195
-
196
- # Maximum gradient norm (gradient clipping)
197
- max_grad_norm = 0.3
198
-
199
- # Initial learning rate (AdamW optimizer)
200
- learning_rate = 2e-4
201
-
202
- # Weight decay to apply to all layers except bias/LayerNorm weights
203
- weight_decay = 0.001
204
-
205
- # Optimizer to use
206
- optim = "paged_adamw_32bit"
207
-
208
- # Learning rate schedule
209
- lr_scheduler_type = "cosine"
210
-
211
- # Number of training steps (overrides num_train_epochs)
212
- max_steps = -1
213
-
214
- # Ratio of steps for a linear warmup (from 0 to learning rate)
215
- warmup_ratio = 0.03
216
-
217
- # Group sequences into batches with the same length
218
- # Saves memory and speeds up training considerably
219
- group_by_length = True
220
-
221
- # Save a checkpoint every X update steps
222
- save_steps = 0
223
-
224
- # Log every X update steps
225
- logging_steps = 25
226
-
227
- ################################################################################
228
- # SFT parameters
229
- ################################################################################
230
-
231
- # Maximum sequence length to use
232
- max_seq_length = None
233
-
234
- # Pack multiple short examples in the same input sequence to increase efficiency
235
- packing = False
236
-
237
- # Load the entire model on GPU 0
238
- device_map = {"": 0}
239
-
240
- # Load dataset (you can process it here)
241
- # dataset = load_dataset(dataset_name, split="train")
242
-
243
- # Load tokenizer and model with QLoRA configuration
244
- compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
245
-
246
- bnb_config = BitsAndBytesConfig(
247
- load_in_4bit=use_4bit,
248
- bnb_4bit_quant_type=bnb_4bit_quant_type,
249
- bnb_4bit_compute_dtype=compute_dtype,
250
- bnb_4bit_use_double_quant=use_nested_quant,
251
- )
252
-
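For a rough sense of why this fits the T4 mentioned in the docstring: a 7B-parameter model stored as 4-bit NF4 weights takes about 7e9 × 0.5 bytes ≈ 3.5 GB, versus roughly 14 GB in fp16, leaving room on a 16 GB T4 for activations and the LoRA adapter weights (these figures are approximate and ignore quantization constants and optimizer state).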
253
- # Check GPU compatibility with bfloat16
254
- if compute_dtype == torch.float16 and use_4bit:
255
- major, _ = torch.cuda.get_device_capability()
256
- if major >= 8:
257
- print("=" * 80)
258
- print("Your GPU supports bfloat16: accelerate training with bf16=True")
259
- print("=" * 80)
260
-
261
- # Load base model
262
- model = AutoModelForCausalLM.from_pretrained(
263
- model_name,
264
- quantization_config=bnb_config,
265
- device_map=device_map
266
- )
267
- model.config.use_cache = False
268
- model.config.pretraining_tp = 1
269
-
270
- # Load LLaMA tokenizer
271
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
272
- tokenizer.pad_token = tokenizer.eos_token
273
- tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
274
-
275
- # Load LoRA configuration
276
- peft_config = LoraConfig(
277
- lora_alpha=lora_alpha,
278
- lora_dropout=lora_dropout,
279
- r=lora_r,
280
- bias="none",
281
- task_type="CAUSAL_LM",
282
- )
283
-
284
- # Set training parameters
285
- training_arguments = TrainingArguments(
286
- output_dir=output_dir,
287
- num_train_epochs=num_train_epochs,
288
- per_device_train_batch_size=per_device_train_batch_size,
289
- gradient_accumulation_steps=gradient_accumulation_steps,
290
- optim=optim,
291
- save_steps=save_steps,
292
- logging_steps=logging_steps,
293
- learning_rate=learning_rate,
294
- weight_decay=weight_decay,
295
- fp16=fp16,
296
- bf16=bf16,
297
- max_grad_norm=max_grad_norm,
298
- max_steps=max_steps,
299
- warmup_ratio=warmup_ratio,
300
- group_by_length=group_by_length,
301
- lr_scheduler_type=lr_scheduler_type,
302
- report_to="tensorboard"
303
- )
304
-
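The script defines all the QLoRA, LoRA and SFT parameters above but never actually runs a training step (the dataset loading is commented out). A minimal sketch of how these objects are typically wired together with trl 0.4.7, assuming the commented-out guanaco dataset, which stores its prompts in a "text" column:

# Sketch only; not part of the original file. load_dataset and SFTTrainer are imported above.
dataset = load_dataset("mlabonne/guanaco-llama2-1k", split="train")

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)
trainer.train()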
305
- def extract_between_inst_and_newline(text):
306
- start_tag = "[/INST]"
307
- end_char = "\n"
308
-
309
- start_index = text.find(start_tag)
310
-
311
- if start_index != -1:
312
- end_index = text.find(end_char, start_index)
313
- if end_index != -1:
314
- extracted_text = text[start_index + len(start_tag):end_index]
315
- return extracted_text.strip()
316
-
317
- return None
318
-
319
- import re
320
- from functools import lru_cache
321
-
322
- @lru_cache
323
- def extract_classification_and_remark(output):
324
- classification_match = re.search(r'Classification: (.*?)\n', output)
325
- remark_match = re.search(r'Remark: (.*?)$', output)
326
-
327
- classification = classification_match.group(1) if classification_match else None
328
- remark = remark_match.group(1) if remark_match else None
329
-
330
- return classification, remark
331
-
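A quick illustration of what the parser above expects and returns (the reply string is invented, not real model output):

example_reply = "Classification: happy\nRemark: The comment praises the video."
print(extract_classification_and_remark(example_reply))
# -> ('happy', 'The comment praises the video.')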
332
- # Ignore warnings
333
- logging.set_verbosity(logging.CRITICAL)
334
-
335
- # Run a text-generation pipeline with the model loaded above
336
- prompt = '''Can you classify the human input as either happy, sad, angry, surprised, confused or neutral and tell me why it was classified as such in one short sentence.
337
- Don't reply with anything besides the classification and the remark. Separate the classification and remark with :
338
- Human input: {}'''
339
-
340
- def process_comment(comment):
341
- formatted_prompt = prompt.format(comment)
342
- pipe = pipeline(task="text2text-generation", model=model, tokenizer=tokenizer, max_length=150)
343
- result = pipe(f"<s>[INST] {formatted_prompt} [/INST]")
344
- extract_output = result[0]['generated_text']
345
- classification, remark = extract_classification_and_remark(extract_output)
346
- return comment, classification, remark
347
-
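process_comment rebuilds the generation pipeline on every call, which is costly once it is mapped over many comments below. One possible variant, a sketch with hypothetical names (shared_pipe, process_comment_fast), builds the pipeline once and reuses it:

# Build the pipeline a single time and reuse it for every comment.
shared_pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=150)

def process_comment_fast(comment):
    formatted_prompt = prompt.format(comment)
    result = shared_pipe(f"<s>[INST] {formatted_prompt} [/INST]")
    classification, remark = extract_classification_and_remark(result[0]['generated_text'])
    return comment, classification, remark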
348
- import matplotlib.pyplot as plt
349
- import seaborn as sns
350
-
351
- def return_distribution(new_formatted_df):
352
- # Count how many comments fall into each sentiment class
353
- sentiment_counts = new_formatted_df['classification'].value_counts()
354
- fig = plt.figure()
355
- sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values)
356
- plt.xlabel('Sentiment')
357
- plt.ylabel('Count')
358
- plt.title('Sentiment Distribution')
359
- return fig
360
-
361
- from wordcloud import WordCloud
362
-
363
- def return_highest_sentiment_worldcloud(new_formatted_df, sentiment):
364
- # Create a word cloud for a specific sentiment, e.g., 'happy'
365
- sentiment_comments = new_formatted_df[new_formatted_df['classification'] == sentiment]['comments']
366
- wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(sentiment_comments))
367
- fig = plt.figure(figsize=(10, 5))
368
- plt.imshow(wordcloud, interpolation='bilinear')
369
- plt.axis('off')
370
- plt.title(f'Word Cloud for {sentiment} Comments')
371
- return fig
372
-
373
- import pandas as pd
374
-
375
- def concatenate_remarks_based_on_classification(dataset):
376
-
377
- # Create an empty dictionary to store concatenated remarks for each classification type.
378
- concatenated_remarks = {}
379
-
380
- # Iterate through the dataset to concatenate remarks.
381
- for index, row in dataset.iterrows():
382
- classification = row['classification']
383
- remarks = row['remark']
384
-
385
- # Check if the classification exists in the dictionary.
386
- if classification in concatenated_remarks:
387
- if remarks is not None:
388
- concatenated_remarks[classification] += ' ' + str(remarks)
389
- else:
390
- if remarks is not None:
391
- concatenated_remarks[classification] = str(remarks)
392
-
393
- # Create a new DataFrame with the concatenated remarks.
394
- concatenated_remarks_df = pd.DataFrame(list(concatenated_remarks.items()), columns=['classification', 'concatenated_remarks'])
395
-
396
- return concatenated_remarks_df
397
-
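The loop above can also be written as a pandas groupby; a brief equivalent sketch (it drops rows with a missing remark instead of checking for None):

def concatenate_remarks_groupby(dataset):
    # Join all remarks per classification into one string each.
    grouped = (
        dataset.dropna(subset=['remark'])
               .groupby('classification')['remark']
               .apply(lambda s: ' '.join(s.astype(str)))
               .reset_index(name='concatenated_remarks')
    )
    return grouped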
398
- # !pip install dask -q
399
-
400
- # Prompt for summarizing the concatenated remarks with the same text-generation pipeline
401
- prompt1 = '''Can you summarize the following text in a paragraph of no more than 100 words. Don't respond with anything besides the summary.
402
- Human input: {}'''
403
-
404
- def summarize_text(comment):
405
- formatted_prompt = prompt1.format(comment)
406
- new_pipe = pipeline(task="text2text-generation", model=model, tokenizer=tokenizer, max_length=3000)
407
- new_result = new_pipe(f"<s>[INST] {formatted_prompt} [/INST]")
408
- return new_result
409
-
410
- ## Function for first tab
411
-
412
- import numpy as np
413
- from concurrent.futures import ThreadPoolExecutor
414
- import dask.dataframe as dd
415
- from dask.distributed import Client, LocalCluster
416
- # from multiprocessing import Pool
417
- # num_processes = 4
418
-
419
-
420
- # Import necessary libraries and functions here
421
- # return_df = pd.DataFrame()
422
- # final_analysed_df = pd.DataFrame() # Initialize as None at the global scope
423
-
424
- # Define a Gradio interface
425
- def sentiment_distribution_interface(video_id):
426
- # global final_analysed_df
427
- # global unique_classifications
428
-
429
-
430
- return_df = pd.DataFrame()
431
- # Call the execution function with the video_id
432
- return_df = execution_function(video_id)
433
- print(return_df.head())
434
-
435
- from concurrent.futures import ThreadPoolExecutor
436
-
437
- def process_row(row):  # ~3.9 s per row
438
- comment, classification, remark = process_comment(row.comments)
439
- return comment, classification, remark
440
-
441
- with ThreadPoolExecutor(max_workers=4) as executor: # Adjust the number of workers as needed
442
- results = list(executor.map(process_row, return_df.itertuples()))
443
-
444
- print(type(results))
445
- print(results)
446
-
447
- print("__________________________________________________________________")
448
-
449
- comments, classification, remark = zip(*results)
450
-
451
- # Create a DataFrame from the separated data
452
- df = pd.DataFrame({'comments': comments, 'classification': classification, 'remark': remark})
453
-
454
- print(df.head())
455
-
456
- print("__________________________________________________________________")
457
-
458
- plot = return_distribution(df) # Modify this line to capture the plot
459
-
460
- word_cloud = return_highest_sentiment_worldcloud(df, df['classification'].value_counts().idxmax())
461
-
462
- df.to_csv('processed_comments.csv', index=False) # index=False prevents writing the row numbers as a column
463
-
464
- # Concatenating remarks for different sentiments
465
- # concatenated_remarks_df = concatenate_remarks_based_on_classification(df)
466
- # print(concatenated_remarks_df)
467
-
468
- # final_analysed_df = df
469
-
470
- return plot , word_cloud # Return the plot
471
-
472
- # Function for Second Tab
473
-
474
- def function_for_second_tab(input_val):
475
-
476
- final_analysed_df = pd.read_csv('processed_comments.csv')
477
- final_analysed_df = pd.DataFrame(final_analysed_df)
478
- print(final_analysed_df.head())
479
-
480
- word_cloud = return_highest_sentiment_worldcloud(final_analysed_df, input_val)
481
-
482
- concatenated_remarks_df = concatenate_remarks_based_on_classification(final_analysed_df)
483
-
484
- comments = concatenated_remarks_df.loc[concatenated_remarks_df['classification'] == input_val, 'concatenated_remarks'].values[0]
485
-
486
- summarized_text = summarize_text(comments)
487
-
488
- extract_output_summary = summarized_text[0]['generated_text']
489
-
490
- final_extract = extract_output_summary.split('[/INST]')[1].strip()
491
-
492
- return word_cloud, final_extract
493
-
494
- # Define the first tab
495
- outputs = [gr.Plot(), gr.Plot()]
496
- iface = gr.Interface(fn=sentiment_distribution_interface, inputs="text", outputs=outputs)
497
-
498
-
499
- # Define the second tab
500
- output_second_tab = [gr.Plot(), "text"]
501
- inputs = "text"
502
-
503
- description = ("Enter the sentiment for which you want a detailed report")
504
- app2 = gr.Interface(fn=function_for_second_tab, inputs=inputs, outputs=output_second_tab, description=description)
505
-
506
- # launch the app
507
- demo = gr.TabbedInterface([iface, app2], ["Welcome page", "Visualization page"])
508
- demo.queue().launch()