NickyNicky commited on
Commit
ded0f4c
1 Parent(s): 623806c

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +206 -0
README.md CHANGED
@@ -1,3 +1,209 @@
1
 
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ![image/png](https://cdn-uploads.huggingface.co/production/uploads/641b435ba5f876fe30c5ae0a/oCI0UQjJ1Rux2JXUHtEZB.png)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
 
3
+
4
+
5
+ ---
6
+ license: apache-2.0
7
+ datasets:
8
+ - NickyNicky/aya_dataset_multilingual_chatml_gemma_response_json
9
+ model:
10
+ - NickyNicky/gemma-2b-it_oasst2_chatML_Cluster_2_V1
11
+ language:
12
+ - bg
13
+ - ca
14
+ - cs
15
+ - da
16
+ - de
17
+ - en
18
+ - es
19
+ - fr
20
+ - hr
21
+ - hu
22
+ - it
23
+ - nl
24
+ - pl
25
+ - pt
26
+ - ro
27
+ - ru
28
+ - sl
29
+ - sr
30
+ - sv
31
+ - uk
32
+
33
+ library_name: transformers
34
+
35
+ widget:
36
+ - text: |
37
+ <bos><start_of_turn>system
38
+ You are a helpful AI assistant.
39
+ solo responde en formato json.
40
+ lista de codigos linguisticos disponibles: ["es", "en", "fr", "de"].<end_of_turn>
41
+ <start_of_turn>user
42
+ {
43
+ "input": "fr",
44
+ "targets": "es",
45
+ "inputs_fr": "Quels président des États-Unis ne s’est jamais marié ?",
46
+ }<end_of_turn>
47
+ <start_of_turn>model\n
48
+
49
+ ---
50
+
51
+
52
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/641b435ba5f876fe30c5ae0a/YXqUXFjX8uIJT-mdOnM1h.png)
53
+
54
+ ```
55
+ reference data model:
56
+
57
+ datasets:
58
+ - lang: "bg,ca,cs,da,de,en,es,fr,hr,hu,it,nl,pl,pt,ro,ru,sl,sr,sv,uk"
59
+ link: https://huggingface.co/datasets/NickyNicky/oasst2_clusters
60
+
61
+ model:
62
+ - google/gemma-2b-it
63
+ Link:
64
+ https://huggingface.co/google/gemma-2b-it
65
+
66
+ base fine tune: NickyNicky/gemma-2b-it_oasst2_chatML_Cluster_2_V1
67
+
68
+ Epoch: 2.5
69
+
70
+ future experts: 5
71
+
72
+ Eval model:
73
+ - link:
74
+ soon
75
+
76
+ ```
77
+
78
+
79
+ ## train/loss 0.2377
80
+
81
  ![image/png](https://cdn-uploads.huggingface.co/production/uploads/641b435ba5f876fe30c5ae0a/oCI0UQjJ1Rux2JXUHtEZB.png)
82
+
83
+
84
+ ##
85
+
86
+
87
+ ```Python
88
+ !python -m pip install --upgrade pip
89
+ !pip install "torch>=2.1.1" -U
90
+ !pip install torchaudio==2.2.0
91
+ !pip install -q datasets trl peft bitsandbytes sentencepiece wandb
92
+ !pip install -q accelerate safetensors deepspeed
93
+ !pip install -q scipy ninja -U
94
+ !pip install -q -U transformers==4.38.0
95
+ ```
96
+
97
+
98
+ ## Version
99
+ ```py
100
+ import torch
101
+ torch.__version__
102
+ #OUTPUTS: ('2.2.0+cu121' )
103
+ ```
104
+
105
+ ## How to use
106
+ ```py
107
+
108
+ from transformers import (
109
+ AutoModelForCausalLM,
110
+ AutoTokenizer,
111
+ BitsAndBytesConfig,
112
+ HfArgumentParser,
113
+ TrainingArguments,
114
+ pipeline,
115
+ logging,
116
+ GenerationConfig,
117
+ TextIteratorStreamer,
118
+ )
119
+
120
+ from transformers import StoppingCriteria, StoppingCriteriaList
121
+
122
+ import torch
123
+
124
+ model_id='NickyNicky/gemma-2b-it_oasst2_Cluster_2_aya_dataset_multilingual_chatml_response_json_V1'
125
+
126
+ model = AutoModelForCausalLM.from_pretrained(model_id,
127
+ device_map="auto",
128
+ trust_remote_code=True,
129
+ torch_dtype=torch.bfloat16,
130
+ attn_implementation="flash_attention_2",
131
+ # load_in_4bit=True,
132
+ # low_cpu_mem_usage= True,
133
+
134
+ )
135
+
136
+ max_length=1055
137
+ print("max_length",max_length)
138
+
139
+
140
+ tokenizer = AutoTokenizer.from_pretrained(model_id,
141
+ # use_fast = False,
142
+ max_length=max_length,)
143
+
144
+
145
+ class ListOfTokensStoppingCriteria(StoppingCriteria):
146
+ """
147
+ Clase para definir un criterio de parada basado en una lista de tokens específicos.
148
+ """
149
+ def __init__(self, tokenizer, stop_tokens):
150
+ self.tokenizer = tokenizer
151
+ # Codifica cada token de parada y guarda sus IDs en una lista
152
+ self.stop_token_ids_list = [tokenizer.encode(stop_token, add_special_tokens=False) for stop_token in stop_tokens]
153
+
154
+ def __call__(self, input_ids, scores, **kwargs):
155
+ # Verifica si los últimos tokens generados coinciden con alguno de los conjuntos de tokens de parada
156
+ for stop_token_ids in self.stop_token_ids_list:
157
+ len_stop_tokens = len(stop_token_ids)
158
+ if len(input_ids[0]) >= len_stop_tokens:
159
+ if input_ids[0, -len_stop_tokens:].tolist() == stop_token_ids:
160
+ return True
161
+ return False
162
+
163
+ # Uso del criterio de parada personalizado
164
+ stop_tokens = ["<end_of_turn>"] # Lista de tokens de parada
165
+
166
+ # Inicializa tu criterio de parada con el tokenizer y la lista de tokens de parada
167
+ stopping_criteria = ListOfTokensStoppingCriteria(tokenizer, stop_tokens)
168
+
169
+ # Añade tu criterio de parada a una StoppingCriteriaList
170
+ stopping_criteria_list = StoppingCriteriaList([stopping_criteria])
171
+
172
+
173
+
174
+
175
+ #EXAMPLE #1
176
 + input_text = """James Buchanan es el único presidente que nunca se casó."""
177
 + targets_translate= "en"
178
 +
179
 + txt=f"""<bos><start_of_turn>system
180
 + You are a helpful AI assistant.
181
 + solo responde en formato json.
182
 + lista de codigos linguisticos disponibles: ["es", "en", "fr", "de"].<end_of_turn>
183
 + <start_of_turn>user
184
 + {{
185
 + "input": "es",
186
 + "targets": "{targets_translate}",
187
+ "inputs_es": "{input_text}",
188
+ }}<end_of_turn>
189
+ <start_of_turn>model
190
+ """
191
+
192
+
193
+ inputs = tokenizer.encode(txt,
194
+ return_tensors="pt",
195
+ add_special_tokens=False).to("cuda:0")
196
+ max_new_tokens=1000
197
+ generation_config = GenerationConfig(
198
+ max_new_tokens=max_new_tokens,
199
+ temperature=0.55,
200
+ #top_p=0.9,
201
+ #top_k=len_tokens,
202
+ repetition_penalty=1.1,
203
+ do_sample=True,
204
+ )
205
+ outputs = model.generate(generation_config=generation_config,
206
+ input_ids=inputs,
207
+ stopping_criteria=stopping_criteria_list,)
208
+ tokenizer.decode(outputs[0], skip_special_tokens=False) #True
209
+ ```