In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m37.2 MB/s[0m eta [36m0:00:0

## Import libraries

In [27]:
from transformers import pipeline

import torch
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertForSequenceClassification

## Sentiment Analysis

In [3]:
classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [4]:
results = classifier(["We are very happy to show you the Hugging Face transformers library", "We hope you don't hate it"])

In [5]:
results

[{'label': 'POSITIVE', 'score': 0.999786913394928},
 {'label': 'POSITIVE', 'score': 0.831964373588562}]

In [6]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

In [7]:
classifier1 = pipeline("sentiment-analysis", model = model_name)

In [8]:
results = classifier1(["We are very happy to show you the Hugging Face transformers library", "We hope you don't hate it"])

In [9]:
results

[{'label': 'POSITIVE', 'score': 0.999786913394928},
 {'label': 'POSITIVE', 'score': 0.831964373588562}]

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
classifier2 = pipeline("sentiment-analysis", model = model_name)

In [12]:
tokens = tokenizer.tokenize("We are very happy to show you the Hugging Face transformers library")
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids_direct = tokenizer("We are very happy to show you the Hugging Face transformers library")

In [13]:
print(f" tokens: {tokens}")
print(f" token ids: {token_ids}")
print(f" token ids (direct): {token_ids_direct}")

 tokens: ['we', 'are', 'very', 'happy', 'to', 'show', 'you', 'the', 'hugging', 'face', 'transformers', 'library']
 token ids: [2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 17662, 2227, 19081, 3075]
 token ids (direct): {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 17662, 2227, 19081, 3075, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [14]:
X_train = ["We are very happy to show you the Hugging Face transformers library", "I hate it"]

In [15]:
batch = tokenizer(X_train, padding = True, truncation = True, max_length = 512, return_tensors = "pt")

In [16]:
print(batch)

{'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996, 17662,
          2227, 19081,  3075,   102],
        [  101,  1045,  5223,  2009,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [17]:
with torch.no_grad():
  outputs = model(**batch, labels = torch.tensor([1, 0]))
  print(outputs)
  predictions = F.softmax(outputs.logits, dim = 1)
  print(predictions)
  labels = torch.argmax(predictions, dim = 1)
  print(labels)
  labels = [model.config.id2label[label_id] for label_id in labels.tolist()]
  print(labels)

SequenceClassifierOutput(loss=tensor(0.0003), logits=tensor([[-4.1038,  4.3497],
        [ 4.3940, -3.5347]]), hidden_states=None, attentions=None)
tensor([[2.1309e-04, 9.9979e-01],
        [9.9964e-01, 3.6016e-04]])
tensor([1, 0])
['POSITIVE', 'NEGATIVE']


In [18]:
save_directory = "saved"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

tokenizer = AutoTokenizer.from_pretrained(save_directory)

In [19]:
model = AutoModelForSequenceClassification.from_pretrained(save_directory)

## Another model

In [20]:
model1_name = "oliverguhr/german-sentiment-bert"

In [21]:
model_german = AutoModelForSequenceClassification.from_pretrained(model1_name)
tokenizer_german = AutoTokenizer.from_pretrained(model1_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/161 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [22]:
X_train_german = ["Guten Abend", "Wie heißt du?", "Wiederholen Sie bitte.", "Keine sorge."]

In [23]:
batch = tokenizer(X_train_german, padding = True, truncation = True, max_length = 512, return_tensors = "pt")
print(batch)

{'input_ids': tensor([[  101,  9535,  2368, 14863,  4859,   102,     0,     0,     0,     0,
             0,     0],
        [  101, 15536,  2063,  2002,  2072, 19310,  2102,  4241,  1029,   102,
             0,     0],
        [  101, 15536, 14728, 25032,  9890,  2078,  9033,  2063,  2978,  2618,
          1012,   102],
        [  101, 26679,  2638,  2061, 20800,  1012,   102,     0,     0,     0,
             0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}


In [24]:
#batch = tokenizer(X_train_german, padding = True, truncation = True, max_length = 512)
#batch = torch.tensor(batch["input_ids"])
#print(batch)

In [25]:
with torch.no_grad():
  outputs_german = model_german(**batch)
  print(outputs)
  predictions_german = F.softmax(outputs_german.logits, dim = 1)
  print(predictions_german)
  labels_german = torch.argmax(predictions_german, dim = 1)
  print(labels_german)
  labels_german = [model_german.config.id2label[label_id] for label_id in labels_german.tolist()]
  print(labels_german)

SequenceClassifierOutput(loss=tensor(0.0003), logits=tensor([[-4.1038,  4.3497],
        [ 4.3940, -3.5347]]), hidden_states=None, attentions=None)
tensor([[2.0787e-02, 1.2513e-01, 8.5408e-01],
        [2.4477e-02, 8.2572e-01, 1.4981e-01],
        [5.3152e-05, 1.4655e-04, 9.9980e-01],
        [2.0299e-05, 6.2296e-05, 9.9992e-01]])
tensor([2, 1, 2, 2])
['neutral', 'negative', 'neutral', 'neutral']


## Testing my own tuned model shared to Hugging Face

In [26]:
model_name2 = "ayethuzar/tuned-for-patentability"

In [29]:
model_at = DistilBertForSequenceClassification.from_pretrained(model_name2, num_labels=2)
tokenizer_at = AutoTokenizer.from_pretrained(model_name2)

Downloading (…)okenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [30]:
with torch.no_grad():
  outputs_at = model_at(**batch)
  print(outputs_at)
  predictions_at = F.softmax(outputs_at.logits, dim = 1)
  print(predictions_at)
  labels_at = torch.argmax(predictions_at, dim = 1)
  print(labels_at)


SequenceClassifierOutput(loss=None, logits=tensor([[-1.5925,  1.0365],
        [-1.5714,  1.0243],
        [-1.5936,  1.0340],
        [-1.5936,  1.0302]]), hidden_states=None, attentions=None)
tensor([[0.0673, 0.9327],
        [0.0694, 0.9306],
        [0.0674, 0.9326],
        [0.0676, 0.9324]])
tensor([1, 1, 1, 1])


References:

https://huggingface.co/docs/transformers/main_classes/pipelines