---
language:
- spa
- agr
tags:
- translation
- nllb
- aguaruna
- spanish
license: apache-2.0
---

# NLLB-200-600M Fine-tuned for Aguaruna-Spanish Translation

This is an **NLLB-200-600M** model fine-tuned to translate between Aguaruna (`agr`) and Spanish (`spa`). Aguaruna is not one of NLLB-200's original languages, so its language code, `agr_Latn`, has to be registered with the tokenizer before use; the `fix_tokenizer` helper in the snippet below takes care of that.

## How to use the model

```python
# In a notebook/Colab. transformers is pinned to 4.33 because this snippet
# touches tokenizer internals (lang_code_to_id, fairseq_tokens_to_ids) that
# may differ in later releases.
!pip install sentencepiece transformers==4.33
import torch
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM

def fix_tokenizer(tokenizer, new_lang='agr_Latn'):
    """Register a new language code with the NLLB tokenizer."""
    old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
    tokenizer.lang_code_to_id[new_lang] = old_len - 1
    tokenizer.id_to_lang_code[old_len - 1] = new_lang
    # Keep <mask> at the end of the vocabulary, after all language codes.
    tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset

    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
    if new_lang not in tokenizer._additional_special_tokens:
        tokenizer._additional_special_tokens.append(new_lang)
    # Clear the cached added-token maps so they are rebuilt consistently.
    tokenizer.added_tokens_encoder = {}
    tokenizer.added_tokens_decoder = {}

MODEL_URL = "hectordiazgomez/nllb-spa-awa-v3"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_URL)
tokenizer = NllbTokenizer.from_pretrained(MODEL_URL)
fix_tokenizer(tokenizer)

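# (Sanity check added for this card, not part of the original snippet:
# after patching, the new language code should resolve to a token id.)
assert 'agr_Latn' in tokenizer.lang_code_to_id
print(tokenizer.lang_code_to_id['agr_Latn'])
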
def translate(
    text,
    model,
    tokenizer,
    src_lang='agr_Latn',
    tgt_lang='spa_Latn',
    max_length='auto',
    num_beams=4,
    n_out=None,
    **kwargs
):
    """Translate a string (or list of strings) between Aguaruna and Spanish."""
    tokenizer.src_lang = src_lang
    # padding=True lets lists of unequal-length sentences batch correctly.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    if max_length == 'auto':
        # Scale the generation budget with the input length.
        max_length = int(32 + 2.0 * encoded.input_ids.shape[1])
    model.eval()
    generated_tokens = model.generate(
        **encoded.to(model.device),
        # Force the first generated token to be the target language code.
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=n_out or 1,
        **kwargs
    )
    out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    if isinstance(text, str) and n_out is None:
        return out[0]
    return out

translate("Uchi piipichi buuke baejai.", model=model, tokenizer=tokenizer)
# El niño se quedo con el pelo. (Roughly: "The boy kept the hair.")
```
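
Because `translate` exposes `src_lang` and `tgt_lang`, the same helper covers the opposite direction, and `n_out` returns several beam candidates instead of a single string. A minimal sketch under those assumptions; the Spanish input sentence below is only an illustration, not an example from the model's data:

```python
# Spanish -> Aguaruna: swap the language codes.
print(translate(
    "Los niños juegan en el río.",  # arbitrary Spanish example
    model=model,
    tokenizer=tokenizer,
    src_lang='spa_Latn',
    tgt_lang='agr_Latn',
))

# Ask beam search for three alternative translations of one input.
candidates = translate(
    "Uchi piipichi buuke baejai.",
    model=model,
    tokenizer=tokenizer,
    num_beams=5,
    n_out=3,
)
print(candidates)  # list of 3 strings
```

On a GPU, call `model.cuda()` once after loading; `translate` already moves the encoded inputs to `model.device`.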