NickyNicky commited on
Commit
ab00469
·
verified ·
1 Parent(s): 8056c9d

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +173 -2
README.md CHANGED
@@ -1,2 +1,173 @@
1
- - oasst2_chatML_Cluster_1: future experts Cluster_1
2
- - Epoch: 7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - Open-Orca/OpenOrca
5
+ - OpenAssistant/oasst_top1_2023-08-25
6
+ language:
7
+ - bg
8
+ - ca
9
+ - cs
10
+ - da
11
+ - de
12
+ - en
13
+ - es
14
+ - fr
15
+ - hr
16
+ - hu
17
+ - it
18
+ - nl
19
+ - pl
20
+ - pt
21
+ - ro
22
+ - ru
23
+ - sl
24
+ - sr
25
+ - sv
26
+ - uk
27
+
28
+ library_name: transformers
29
+ ---
30
+
31
+
32
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/641b435ba5f876fe30c5ae0a/rJ1RxzuE-3gzgCppx-T8f.png)
33
+
34
+ ```
35
+ reference-data-model:
36
+
37
+ datasets:
38
+ - lang: "bg,ca,cs,da,de,en,es,fr,hr,hu,it,nl,pl,pt,ro,ru,sl,sr,sv,uk"
39
+ link: https://huggingface.co/datasets/OpenAssistant/oasst_top1_2023-08-25
40
+
41
+ model:
42
+ - google/gemma-2b-it
43
+ link:
44
+ https://huggingface.co/google/gemma-2b-it
45
+
46
+
47
+ Eval model:
48
+ - link:
49
+ ...
50
+
51
+ ```
52
+
53
+
54
+ ## Installation
55
+
56
+
57
+ ```python
58
+ !python -m pip install --upgrade pip
59
+ !pip install "torch>=2.1.1" -U
60
+ !pip install torchaudio==2.2.0
61
+ !pip install -q datasets trl peft bitsandbytes sentencepiece wandb
62
+ !pip install -q accelerate safetensors deepspeed
63
+ !pip install -q scipy ninja -U
64
+ !pip install -q -U transformers==4.38.0
65
+ ```
66
+
67
+
68
+ ## Version
69
+ ```py
70
+ import torch
71
+ torch.__version__
72
+ # OUTPUT: '2.2.0+cu121'
73
+ ```
74
+
75
+ ## How to use
76
+ ```py
77
+
78
+ from transformers import (
79
+ AutoModelForCausalLM,
80
+ AutoTokenizer,
81
+ BitsAndBytesConfig,
82
+ HfArgumentParser,
83
+ TrainingArguments,
84
+ pipeline,
85
+ logging,
86
+ GenerationConfig,
87
+ TextIteratorStreamer,
88
+ )
89
+
90
+ from transformers import StoppingCriteria, StoppingCriteriaList
91
+
92
+ import torch
93
+
94
+ model_id='NickyNicky/gemma-2b-it_oasst2_chatML_Cluster_1_V1'
95
+
96
+ model = AutoModelForCausalLM.from_pretrained(model_id,
97
+ device_map="auto",
98
+ trust_remote_code=True,
99
+ torch_dtype=torch.bfloat16,
100
+ # load_in_4bit=True,
101
+ # low_cpu_mem_usage= True,
102
+
103
+ )
104
+
105
+ max_length=2055
106
+ print("max_length",max_length)
107
+
108
+
109
+ tokenizer = AutoTokenizer.from_pretrained(model_id,
110
+ # use_fast = False,
111
+ max_length=max_length,)
112
+
113
+
114
+ class ListOfTokensStoppingCriteria(StoppingCriteria):
115
+ """
116
+ Clase para definir un criterio de parada basado en una lista de tokens específicos.
117
+ """
118
+ def __init__(self, tokenizer, stop_tokens):
119
+ self.tokenizer = tokenizer
120
+ # Codifica cada token de parada y guarda sus IDs en una lista
121
+ self.stop_token_ids_list = [tokenizer.encode(stop_token, add_special_tokens=False) for stop_token in stop_tokens]
122
+
123
+ def __call__(self, input_ids, scores, **kwargs):
124
+ # Verifica si los últimos tokens generados coinciden con alguno de los conjuntos de tokens de parada
125
+ for stop_token_ids in self.stop_token_ids_list:
126
+ len_stop_tokens = len(stop_token_ids)
127
+ if len(input_ids[0]) >= len_stop_tokens:
128
+ if input_ids[0, -len_stop_tokens:].tolist() == stop_token_ids:
129
+ return True
130
+ return False
131
+
132
+ # Uso del criterio de parada personalizado
133
+ stop_tokens = ["<end_of_turn>"] # Lista de tokens de parada
134
+
135
+ # Inicializa tu criterio de parada con el tokenizer y la lista de tokens de parada
136
+ stopping_criteria = ListOfTokensStoppingCriteria(tokenizer, stop_tokens)
137
+
138
+ # Añade tu criterio de parada a una StoppingCriteriaList
139
+ stopping_criteria_list = StoppingCriteriaList([stopping_criteria])
140
+
141
+
142
+
143
+ #EXAMPLE #1
144
+ txt="""<bos><start_of_turn>system
145
+ You are a helpful AI assistant.<end_of_turn>
146
+ <start_of_turn>user
147
+ Me dices los diferentes tipos de reciclaje que suelen existir en las ciudades europeas<end_of_turn>
148
+ <start_of_turn>model
149
+ """
150
+
151
+ #EXAMPLE #2 (note: this reassigns txt, replacing EXAMPLE #1 — comment out the example you don't want)
152
+ txt="""<bos><start_of_turn>system
153
+ You are a helpful AI assistant.<end_of_turn>
154
+ <start_of_turn>user
155
+ What is the meaning of life in the current time?<end_of_turn>
156
+ <start_of_turn>model
157
+ """
158
+
159
+ inputs = tokenizer.encode(txt, return_tensors="pt").to("cuda")
160
+
161
+ generation_config = GenerationConfig(
162
+ max_new_tokens=max_length,  # max_new_tokens was undefined; reuse the max_length defined above
163
+ temperature=0.55,
164
+ #top_p=0.9,
165
+ #top_k=len_tokens,
166
+ repetition_penalty=1.1,
167
+ do_sample=True,
168
+ )
169
+ outputs = model.generate(generation_config=generation_config,
170
+ input_ids=inputs,
171
+ stopping_criteria=stopping_criteria_list,)
172
+ tokenizer.decode(outputs[0], skip_special_tokens=False) #True
173
+ ```