nicholasKluge committed
Commit b67821c
1 Parent(s): a9bec79

Update README.md

Files changed (1): README.md (+16, -51)
README.md CHANGED
@@ -11,77 +11,42 @@ pipeline_tag: text-classification
 tags:
 - sentiment-analysis
 ---
-
-## bert-base-cased
-
-| Epoch | Training Loss | Validation Loss | Accuracy |
-|-------|---------------|-----------------|----------|
-| 1     | 0.304600      | 0.224774        | 0.908200 |
-| 2     | 0.138800      | 0.222201        | 0.918200 |
-| 3     | 0.080800      | 0.316631        | 0.922200 |
-
-## Gpt2-portuguese-small
-
-| Epoch | Training Loss | Validation Loss | Accuracy |
-|-------|---------------|-----------------|----------|
-| 1     | 0.341800      | 0.241748        | 0.897600 |
-| 2     | 0.202500      | 0.224077        | 0.911600 |
-| 3     | 0.149300      | 0.239030        | 0.916000 |
-
-## nicholasKluge/Teeny-tiny-llama-162m-imdb
-
-| Epoch | Training Loss | Validation Loss | Accuracy |
-|-------|---------------|-----------------|----------|
-| 1     | 0.344300      | 0.224800        | 0.911400 |
-| 2     | 0.149300      | 0.248538        | 0.906200 |
-| 3     | 0.081900      | 0.286298        | 0.909600 |
+# TeenyTinyLlama-162m-IMDB
+
+TeenyTinyLlama is a series of small foundational models trained on Portuguese.
+
+This repository contains a version of [TeenyTinyLlama-162m](https://huggingface.co/nicholasKluge/TeenyTinyLlama-162m) fine-tuned on a translated version of the IMDB dataset.
+
 
 ```python
 # IMDB
 ! pip install transformers datasets evaluate accelerate -q
 
 import evaluate
 import numpy as np
-from huggingface_hub import login
 from datasets import load_dataset, Dataset, DatasetDict
 from transformers import AutoTokenizer, DataCollatorWithPadding
 from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
 
-# Basic fine-tuning arguments
-token="your_token"
-task="christykoh/imdb_pt"
-model_name="neuralmind/bert-base-portuguese-cased"
-output_dir="checkpoint"
-learning_rate=4e-5
-per_device_train_batch_size=32
-per_device_eval_batch_size=32
-num_train_epochs=3
-weight_decay=0.01
 evaluation_strategy="epoch"
 save_strategy="epoch"
 hub_model_id="nicholasKluge/Teeny-tiny-llama-162m-imdb"
 
-# Login on the hub to load and push
-login(token=token)
-
 # Load the task
-dataset = load_dataset(task)
+dataset = load_dataset("christykoh/imdb_pt")
 
 # Create a `ModelForSequenceClassification`
 model = AutoModelForSequenceClassification.from_pretrained(
-    model_name,
+    "nicholasKluge/TeenyTinyLlama-162m",
     num_labels=2,
     id2label={0: "NEGATIVE", 1: "POSITIVE"},
     label2id={"NEGATIVE": 0, "POSITIVE": 1}
 )
 
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-# If model does not have a pad_token, we need to add it
-#tokenizer.pad_token = tokenizer._eos_token
-#model.config.pad_token_id = model.config.eos_token_id
+tokenizer = AutoTokenizer.from_pretrained("nicholasKluge/TeenyTinyLlama-162m")
 
-# Pre process the dataset
+# Preprocess the dataset
 def preprocess_function(examples):
     return tokenizer(examples["text"], truncation=True, max_length=256)
 
@@ -90,7 +55,7 @@ dataset_tokenized = dataset.map(preprocess_function, batched=True)
 # Create a simple data collator
 data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
 
-# Use accuracy as evaluation metric
+# Use accuracy as an evaluation metric
 accuracy = evaluate.load("accuracy")
 
 # Function to compute accuracy
@@ -101,12 +66,12 @@ def compute_metrics(eval_pred):
 
 # Define training arguments
 training_args = TrainingArguments(
-    output_dir=output_dir,
-    learning_rate=learning_rate,
-    per_device_train_batch_size=per_device_train_batch_size,
-    per_device_eval_batch_size=per_device_eval_batch_size,
-    num_train_epochs=num_train_epochs,
-    weight_decay=weight_decay,
+    output_dir="checkpoints",
+    learning_rate=4e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=3,
+    weight_decay=0.01,
     evaluation_strategy=evaluation_strategy,
     save_strategy=save_strategy,
     load_best_model_at_end=True,
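The last hunk ends inside the `TrainingArguments` call, so the tail of the fine-tuning script is not visible in this diff. Below is a minimal sketch of a plausible continuation, built only from names the snippet already defines; the `compute_metrics` body, the closing of `TrainingArguments`, and the `Trainer` wiring are assumptions, not part of this commit.

```python
# Assumed continuation; not part of this commit.

# Function to compute accuracy (its body is elided by the diff; this is the
# standard pattern for a metric loaded via evaluate.load("accuracy"))
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="checkpoints",
    learning_rate=4e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy=evaluation_strategy,
    save_strategy=save_strategy,
    load_best_model_at_end=True,
    hub_model_id=hub_model_id,  # defined near the top of the snippet
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_tokenized["train"],  # assumes IMDB's train/test splits
    eval_dataset=dataset_tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
```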
 
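For completeness, a usage sketch for the resulting classifier through the `text-classification` pipeline; it assumes the fine-tuned checkpoint is published under the `hub_model_id` shown above.

```python
from transformers import pipeline

# Assumes the fine-tuned weights were pushed under the hub_model_id above.
classifier = pipeline(
    "text-classification",
    model="nicholasKluge/Teeny-tiny-llama-162m-imdb",
)

# A Portuguese review, since the model was fine-tuned on a translated IMDB.
print(classifier("Este filme é maravilhoso!"))
# -> [{'label': 'POSITIVE', 'score': ...}]
```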