### Best Use of Open Source Models - HuggingFace

In this track, participants will explore the power of natural language processing using the HuggingFace library. They can build applications for text analysis, sentiment analysis, and more.

# 1. Preprocessing

In [None]:
# Colab-only helper used to upload local files into the runtime
from google.colab import files

# Upload the play script CSV; the next cell reads it by file name
# from the working directory.
uploaded = files.upload()

Saving TestPlay_modified_modified.csv to TestPlay_modified_modified.csv


In [None]:
import pandas as pd

# File name of the play script CSV uploaded in the previous cell
csv_file_path = 'TestPlay_modified_modified.csv'

# Read the CSV file into `data`, the DataFrame every later annotation step
# (Gender, Emotion) writes its column into
data = pd.read_csv(csv_file_path)

# Display the DataFrame
data

Unnamed: 0,Character,Line,Gender,Age,Emotion,Characteristic
0,NARRATOR,HARPER sits at a table alone in a room. She is...,,,,
1,FATHER,"Hi, Harper. I’m really happy you came.",,,,
2,HARPER,Happy Father’s Day.,,,,
3,FATHER,"Thanks… wow, you’re growing fast. How long has...",,,,
4,HARPER,It was Christmas.,,,,
...,...,...,...,...,...,...
59,FATHER,Never do anything that keeps you from giving y...,,,,
60,HARPER,I will.,,,,
61,FATHER,"Goodbye, Harper.",,,,
62,HARPER,"Goodbye, Daddy.",,,,


# 2.  Gender Prediction

**accelerate**: a library that enables the same PyTorch code to be run across any distributed configuration

**transformers**: provides APIs and tools to easily download and train state-of-the-art pretrained models

In [None]:
# Exact versions are pinned so the rest of the notebook runs against the
# library versions it was written for (verified in the next cell).
!pip install accelerate==0.21.0
!pip install transformers==4.30.1

Collecting accelerate==0.21.0
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0
Collecting transformers==4.30.1
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.1)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m116.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.0
    Uninstalling tokenizers-0.15.0:
      Successfully uninst

In [None]:
import accelerate
import transformers

# Sanity check: show the versions that were actually imported so they can be
# compared with the versions pinned in the install cell
transformers.__version__, accelerate.__version__

('4.30.1', '0.21.0')

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the pretrained "gpt2" tokenizer and language model. Both are read as
# notebook-level globals by predict_gender_aggregated in the gender cell.
# NOTE: the fine-tuning cell later rebinds `tokenizer` and `model` to a BERT
# classifier, so re-run this cell before re-running gender prediction.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

**temperature = 0.7**
* Reduced Randomness Compared to Default
* More Conservative Word Selection
* Balance Between Predictability and Creativity
* temperature: how probabilities are scaled
* low temperature (close to 0): less randomness
* high temperature: increases randomness
* temperature = 1: default setting

**attention_mask = torch.ones(input_ids.shape)**
* when input sequence contains no padding
* padding token: many NLP models require that all input sequences be of the same length for processing in batches
* attention mask: real data tokens marked with a 1, padding tokens marked with a 0

In [None]:
# Define gender predictions for specific characters
character_gender_mapping = {
    "NARRATOR": "neutral",
    "FATHER": "male",
    "HARPER": "female"
}

def predict_gender_aggregated(character, lines):
    """Return a gender label for ``character``.

    Characters listed in ``character_gender_mapping`` (matched
    case-insensitively) get their fixed label. Any other character is
    labelled by prompting the global GPT-2 ``model`` / ``tokenizer`` with the
    character's aggregated dialogue and reading the generated continuation.

    Args:
        character: Character name as it appears in the script.
        lines: Iterable of that character's dialogue lines. Non-string
            entries (e.g. NaN from empty CSV cells) are skipped.

    Returns:
        ``'male'``, ``'female'`` or ``'unknown'`` for model-predicted
        characters, or the mapped label (which may also be ``'neutral'``)
        for characters in ``character_gender_mapping``.

    Note:
        Generation samples with ``temperature=0.7``, so the prediction for an
        unmapped character can differ between runs.
    """
    import re  # local import keeps this cell runnable on its own

    # Known characters short-circuit the model call entirely
    known_gender = character_gender_mapping.get(character.upper())
    if known_gender is not None:
        return known_gender

    # str.join raises on non-strings, so drop anything that is not text
    aggregated_text = " ".join(line for line in lines if isinstance(line, str))
    input_text = f"Character: {character}. Dialogue: {aggregated_text}. Gender:"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Single unpadded sequence: every position is a real token
    attention_mask = torch.ones(input_ids.shape)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        # Budget applies to generated tokens only. The previous max_length=60
        # also counted the prompt, leaving little or no room to generate once
        # the dialogue was longer than a few sentences.
        max_new_tokens=10,
        do_sample=True,
        temperature=0.7,
        # GPT-2 defines no pad token; reuse EOS explicitly
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the continuation, so gendered words in the prompt's
    # dialogue cannot decide the label.
    prompt_length = input_ids.shape[1]
    completion = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    # Whole-word match: a plain substring test for 'male' also matches
    # 'female', which made the female branch unreachable.
    match = re.search(r"\b(female|male)\b", completion.lower())
    return match.group(1) if match else 'unknown'

# Collect each character's lines: index = character name, value = list of lines
character_lines = data.groupby('Character')['Line'].apply(list)

# One prediction per distinct character, keyed by the same character index
character_genders = pd.Series(
    [predict_gender_aggregated(name, spoken) for name, spoken in character_lines.items()],
    index=character_lines.index,
    name=character_lines.index.name,
)

# Copy each character's predicted gender onto every row of the original DataFrame
data['Gender'] = data['Character'].map(character_genders)
data

Unnamed: 0,Character,Line,Gender,Age,Emotion,Characteristic
0,NARRATOR,HARPER sits at a table alone in a room. She is...,neutral,,,
1,FATHER,"Hi, Harper. I’m really happy you came.",male,,,
2,HARPER,Happy Father’s Day.,female,,,
3,FATHER,"Thanks… wow, you’re growing fast. How long has...",male,,,
4,HARPER,It was Christmas.,female,,,
...,...,...,...,...,...,...
59,FATHER,Never do anything that keeps you from giving y...,male,,,
60,HARPER,I will.,female,,,
61,FATHER,"Goodbye, Harper.",male,,,
62,HARPER,"Goodbye, Daddy.",female,,,


# 3. Sentiment Analysis

## 3.1. HuggingFace Library

In [None]:
from transformers import pipeline

# Initialize sentiment analysis pipeline.
# The checkpoint and revision are named explicitly instead of relying on the
# library default. They are the ones the default call resolved to, so the
# labels stay reproducible if that default changes.
sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    revision='af0f99b',
)

def addEmotionColumn(df):
    """Label each entry of ``df['Line']`` with the sentiment pipeline's top label.

    Runs the global ``sentiment_pipeline`` once per line and keeps the
    ``'label'`` field of the first result. The labels are written into
    ``df['Emotion']`` (the frame is modified in place) and the same frame is
    returned.
    """
    df['Emotion'] = [sentiment_pipeline(text)[0]['label'] for text in df['Line']]
    return df

# Add emotion column
# (addEmotionColumn fills data['Emotion'] in place and returns the same frame)
data = addEmotionColumn(data)
data

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


Unnamed: 0,Character,Line,Gender,Age,Emotion,Characteristic
0,NARRATOR,HARPER sits at a table alone in a room. She is...,neutral,,NEGATIVE,
1,FATHER,"Hi, Harper. I’m really happy you came.",male,,POSITIVE,
2,HARPER,Happy Father’s Day.,female,,POSITIVE,
3,FATHER,"Thanks… wow, you’re growing fast. How long has...",male,,POSITIVE,
4,HARPER,It was Christmas.,female,,POSITIVE,
...,...,...,...,...,...,...
59,FATHER,Never do anything that keeps you from giving y...,male,,NEGATIVE,
60,HARPER,I will.,female,,POSITIVE,
61,FATHER,"Goodbye, Harper.",male,,NEGATIVE,
62,HARPER,"Goodbye, Daddy.",female,,NEGATIVE,


## 3.2. Fine-Tuning

### 3.2.1. Get Dataset

#### 3.2.1.1. HuggingFace

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.5-py3-none-any.whl (7.8 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.5


In [None]:
from datasets import load_dataset

# Load the "sentences_allagree" configuration of the financial_phrasebank dataset
dataset = load_dataset("financial_phrasebank", "sentences_allagree")

# Convert the dataset's 'train' split to a pandas DataFrame
# (columns: 'sentence' text and an integer 'label')
df = pd.DataFrame(dataset['train'])
df

Downloading builder script:   0%|          | 0.00/6.04k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

Unnamed: 0,sentence,label
0,"According to Gran , the company has no plans t...",1
1,"For the last quarter of 2010 , Componenta 's n...",2
2,"In the third quarter of 2010 , net sales incre...",2
3,Operating profit rose to EUR 13.1 mn from EUR ...,2
4,"Operating profit totalled EUR 21.1 mn , up fro...",2
...,...,...
2259,Operating result for the 12-month period decre...,0
2260,HELSINKI Thomson Financial - Shares in Cargote...,0
2261,LONDON MarketWatch -- Share prices ended lower...,0
2262,Operating profit fell to EUR 35.4 mn from EUR ...,0


In [None]:
from sklearn.model_selection import train_test_split

# Split the data into train (80%) and remaining data (20%)
train_df, remaining_df = train_test_split(df, test_size=0.2, random_state=42)

# Split the remaining data in half into validation and test (10% of the full data each).
# The fixed random_state makes both splits repeatable.
validation_df, test_df = train_test_split(remaining_df, test_size=0.5, random_state=42)

#### 3.2.1.2. Kaggle

In [None]:
!pip install kaggle

import os

# Set your Kaggle API credentials
os.environ["KAGGLE_USERNAME"] = "ddiddu"
os.environ["KAGGLE_KEY"] = "95135e8fe8461f311642da97bc623dd8"

# Download the dataset using the Kaggle API
!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis
!ls

In [None]:
# Unzip the downloaded dataset
!unzip twitter-entity-sentiment-analysis.zip

In [None]:
# The Kaggle CSVs have no header row, so the column names are supplied here.
# Without header=None pandas turns the first tweet of each file into the
# header, which silently drops that row and forces a rename keyed on its values.
TWITTER_COLUMNS = ['Tweet ID', 'entity', 'label', 'sentence']
df = pd.read_csv("twitter_training.csv", header=None, names=TWITTER_COLUMNS)
test_df = pd.read_csv("twitter_validation.csv", header=None, names=TWITTER_COLUMNS)

# Sentiment classes kept for training and their integer ids.
# Rows labelled anything else (i.e. 'Irrelevant') are discarded.
LABEL_TO_ID = {'Positive': 2, 'Neutral': 1, 'Negative': 0}

def _prepare_twitter_frame(frame):
    """Return a cleaned copy of a raw Twitter sentiment frame.

    Keeps only rows whose label is in ``LABEL_TO_ID``, drops rows with a
    missing 'sentence' (the tokenizer is later called on that column and
    needs text), encodes the label as an int, and resets the index.
    """
    kept = frame[frame['label'].isin(list(LABEL_TO_ID))]
    kept = kept.dropna(subset=['sentence'])
    # Filter before encoding so the column holds integers only, not a mix of
    # ints and leftover strings
    kept = kept.assign(label=kept['label'].map(LABEL_TO_ID).astype(int))
    return kept.reset_index(drop=True)

df = _prepare_twitter_frame(df)
test_df = _prepare_twitter_frame(test_df)

test_df

In [None]:
# Number of rows per label value, to check class balance before splitting
df.groupby(['label']).size()

In [None]:
# Split the Kaggle training frame 80/20 into train and validation sets
# (the test set is the separately loaded twitter_validation.csv).
# NOTE: this rebinds train_df / validation_df, replacing the
# financial_phrasebank split from the HuggingFace cell if that cell was also run.
train_df, validation_df = train_test_split(df, test_size=0.2, random_state=42)
validation_df

### 3.2.2. Remove Unnecessary Words

In [None]:
# import nltk
# nltk.download('book')
# from nltk.book import *
# from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# def tokenize_POS(paragraph):
#   words = word_tokenize(paragraph)
#   tagged_words = nltk.pos_tag(words)

#   # Remove not important types of words
#   excluded_tags = ['CC', 'DT', 'IN', 'TO', 'PRP', 'PRP$', 'MD', 'WP', 'WP$', 'WRB']
#   filtered_words = [word for word, pos in tagged_words if pos not in excluded_tags]

#   return ' '.join(filtered_words)

# tokenize_POS(df['sentence'][0])

In [None]:
# # Apply the function to each row in the 'sentence' column of the DataFrame
# df['filtered_sentence'] = df['sentence'].apply(tokenize_POS)

### 3.2.3. Training
* v1 {'eval_loss': 0.3472066819667816,
 'eval_runtime': 1.6455,
 'eval_samples_per_second': 137.95,
 'eval_steps_per_second': 9.116,
 'epoch': 10.0}
* v2 {'eval_loss': 0.22732013463974,
 'eval_runtime': 1.6519,
 'eval_samples_per_second': 137.414,
 'eval_steps_per_second': 9.08,
 'epoch': 10.0}
* v3


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the zipped model is moved to
# /content/drive/MyDrive/ in the last cell of the notebook
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Check if GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Convert DataFrame to Hugging Face Dataset
def df_to_dataset(df):
    """Wrap a pandas DataFrame as a Hugging Face ``Dataset``.

    ``preserve_index=False`` stops the shuffled pandas index from being
    stored as an extra ``__index_level_0__`` column next to the real features.
    """
    return Dataset.from_pandas(df, preserve_index=False)

train_dataset = df_to_dataset(train_df)
validation_dataset = df_to_dataset(validation_df)
test_dataset = df_to_dataset(test_df)

# Load a pretrained model and tokenizer
# NOTE: this rebinds the notebook-level `tokenizer` / `model` that the
# gender-prediction cell uses for GPT-2.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3 labels: positive, negative, neutral
model.to(device)  # Move the model to GPU

# Tokenize the datasets
def tokenize_function(examples):
    """Tokenize a batch of examples from their 'sentence' field.

    Sequences are only truncated here. Padding is left to the Trainer's data
    collator, which pads each batch to its own longest sequence instead of
    padding every example to the model's maximum length.
    """
    return tokenizer(examples['sentence'], truncation=True)

# Show one raw example per split as a quick sanity check
print(train_dataset[0])
print(validation_dataset[0])
print(test_dataset[0])

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_validation = validation_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Define training arguments
# Evaluation and checkpointing both run once per epoch, which
# load_best_model_at_end requires so the best checkpoint can be restored.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# Initialize Trainer
# Passing the tokenizer makes the Trainer pad batches dynamically (the
# counterpart of tokenize_function not padding) and saves the tokenizer
# alongside each checkpoint.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model on the held-out test split
trainer.evaluate(tokenized_test)

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

{'sentence': 'The robust growth was the result of the inclusion of clothing chain Lindex in the Group in December 2007 .', 'label': 2, '__index_level_0__': 316}
{'sentence': 'Unbelievably , the company that makes them - Fiskars Corporation - was formed in 1649 when a Dutch merchant named Peter Thorwoste was given a charter to establish a blast furnace and forging operation in the small Finnish village of Fiskars .', 'label': 1, '__index_level_0__': 2010}
{'sentence': 'Finnish software developer Done Solutions Oyj said its net profit increased to 3.5 mln euro ( $ 4.6 mln ) in 2006 from 2.3 mln euro ( $ 3.0 mln ) in 2005 .', 'label': 2, '__index_level_0__': 124}


Map:   0%|          | 0/1811 [00:00<?, ? examples/s]

Map:   0%|          | 0/226 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/ddiddu/audiobook into local empty directory.


Epoch,Training Loss,Validation Loss
1,No log,0.512281


Epoch,Training Loss,Validation Loss
1,No log,0.512281
2,No log,0.169869
3,No log,0.027977
4,No log,0.109293
5,0.323700,0.014466


In [None]:
# Local directory the fine-tuned classifier is written to
model_path = "./financial_phrasebank_model_v1"

# Save the model and tokenizer
# (both into the same directory, which is the path pipeline() loads from in the next cell)
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./financial_phrasebank_model_v1/tokenizer_config.json',
 './financial_phrasebank_model_v1/special_tokens_map.json',
 './financial_phrasebank_model_v1/vocab.txt',
 './financial_phrasebank_model_v1/added_tokens.json',
 './financial_phrasebank_model_v1/tokenizer.json')

In [None]:
from transformers import pipeline

# Path to your saved model
model_path = "./financial_phrasebank_model_v1"

# Initialize sentiment analysis pipeline with your model
# NOTE: this rebinds `sentiment_pipeline`, so addEmotionColumn uses the
# fine-tuned model from here on instead of the default pipeline
sentiment_pipeline = pipeline('sentiment-analysis', model=model_path, tokenizer=model_path)

In [None]:
def addEmotionColumn(df):
    """Write the top sentiment label of every ``df['Line']`` entry into ``df['Emotion']``.

    This redefines the helper from the earlier sentiment cell with the same
    behaviour. It reads the global ``sentiment_pipeline`` at call time, so it
    uses whichever pipeline is currently bound to that name. The frame is
    modified in place and returned.
    """
    def top_label(text):
        # The pipeline returns a list of results; keep the first one's label
        return sentiment_pipeline(text)[0]['label']

    df['Emotion'] = list(map(top_label, df['Line']))
    return df

# Apply to your data
# (overwrites the 'Emotion' column produced by the default pipeline earlier)
data = addEmotionColumn(data)

# Translate the pipeline's LABEL_<id> names into readable sentiment names.
# A label missing from this mapping becomes NaN after .map().
emotion_mapping = {'LABEL_0': 'negative', 'LABEL_1': 'neutral', 'LABEL_2': 'positive'}
data['Emotion'] = data['Emotion'].map(emotion_mapping)

data

Unnamed: 0,Character,Line,Gender,Age,Emotion,Characteristic
0,NARRATOR,HARPER sits at a table alone in a room. She is...,neutral,,neutral,
1,FATHER,"Hi, Harper. I’m really happy you came.",male,,positive,
2,HARPER,Happy Father’s Day.,female,,positive,
3,FATHER,"Thanks… wow, you’re growing fast. How long has...",male,,positive,
4,HARPER,It was Christmas.,female,,neutral,
...,...,...,...,...,...,...
59,FATHER,Never do anything that keeps you from giving y...,male,,neutral,
60,HARPER,I will.,female,,neutral,
61,FATHER,"Goodbye, Harper.",male,,neutral,
62,HARPER,"Goodbye, Daddy.",female,,neutral,


In [None]:
# Archive the saved model directory recursively into a single zip file
!zip -r financial_phrasebank_model_v1.zip financial_phrasebank_model_v1/

  adding: financial_phrasebank_model_v1/ (stored 0%)
  adding: financial_phrasebank_model_v1/pytorch_model.bin (deflated 7%)
  adding: financial_phrasebank_model_v1/vocab.txt (deflated 53%)
  adding: financial_phrasebank_model_v1/config.json (deflated 51%)
  adding: financial_phrasebank_model_v1/tokenizer_config.json (deflated 43%)
  adding: financial_phrasebank_model_v1/special_tokens_map.json (deflated 42%)
  adding: financial_phrasebank_model_v1/tokenizer.json (deflated 71%)


In [None]:
# Move the archive into the mounted Google Drive folder
# (requires the drive.mount cell to have been run)
!mv financial_phrasebank_model_v1.zip /content/drive/MyDrive/