Bulk update: multiple file changes

Files changed:
- README.md +9 -12
- config.json +2 -2
- tokenization_indictrans.py +1 -3
- tokenizer_config.json +2 -2
README.md (CHANGED)

````diff
@@ -5,7 +5,7 @@ license: mit
 These models are created from their respective IndicTrans2 parent versions by simply replacing the Sinusoidal Positional Embedding with Rotary Positional Embedding ([Su _et al._](https://arxiv.org/abs/2104.09864)), and finetuning them for further alignment.
 
 *NOTE*:
-These models are my independent reproduction of the paper: [Towards Inducing …
+These models are my independent reproduction of the paper: [Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).
 
 Detailed information on the data mixture, hyperparameters, and training curriculum can be found in the paper.
 
@@ -43,8 +43,6 @@ batch = tokenizer(
     batch, padding="longest", truncation=True, max_length=2048, return_tensors="pt"
 ).to(device)
 
-print(batch)
-
 with torch.inference_mode():
     outputs = model.generate(
         **batch,
@@ -56,24 +54,23 @@ with torch.inference_mode():
         early_stopping=True
     )
 
-… (old target_tokenizer-scoped decode; 4 lines, not recoverable from this view)
+# no target_tokenizer scoping is required anymore
+outputs = tokenizer.batch_decode(
+    outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
+)
 
 outputs = ip.postprocess_batch(outputs, lang="eng_Latn")
-print("| > Translations:", outputs[0])
-
+print(" | > Translations:", outputs[0])
 ```
 
 # Citation
 If you use these models directly or fine-tune them further for additional use cases, please cite the following work:
 
 ```bibtex
-@misc{…
-title={Towards Inducing …
+@misc{gumma2025inducinglongcontextabilitiesmultilingual,
+title={Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models},
 author={Varun Gumma and Pranjal A. Chitale and Kalika Bali},
-year={…
+year={2025},
 eprint={2408.11382},
 archivePrefix={arXiv},
 primaryClass={cs.CL},
````
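Net effect of the README edits: the debug `print(batch)` is gone, decoding no longer needs target-tokenizer scoping, and the BibTeX entry is completed. Below is a minimal end-to-end sketch of the updated usage; the `IndicProcessor` import path, the input sentence, and the generation kwargs other than `early_stopping` are not part of this diff and are assumptions.

```python
# Sketch of the updated README flow, under the assumptions noted above.
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor  # assumed import path

device = "cuda" if torch.cuda.is_available() else "cpu"
repo = "prajdabre/rotary-indictrans2-indic-en-1B"  # repo id from config.json below

tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(repo, trust_remote_code=True).to(device)

ip = IndicProcessor(inference=True)
# hypothetical Hindi input; preprocess_batch mirrors the postprocess_batch call in the diff
sentences = ip.preprocess_batch(
    ["यह एक लंबे संदर्भ वाला परीक्षण वाक्य है।"], src_lang="hin_Deva", tgt_lang="eng_Latn"
)
batch = tokenizer(
    sentences, padding="longest", truncation=True, max_length=2048, return_tensors="pt"
).to(device)

with torch.inference_mode():
    outputs = model.generate(**batch, num_beams=5, max_length=2048, early_stopping=True)

# no target_tokenizer scoping is required anymore
outputs = tokenizer.batch_decode(
    outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
outputs = ip.postprocess_batch(outputs, lang="eng_Latn")
print(" | > Translations:", outputs[0])
```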
config.json (CHANGED)

```diff
@@ -15,7 +15,7 @@
   "decoder_normalize_before": true,
   "decoder_start_token_id": 2,
   "decoder_vocab_size": 32296,
-  "dropout": 0.…
+  "dropout": 0.0,
   "encoder_attention_heads": 16,
   "encoder_embed_dim": 1024,
   "encoder_ffn_dim": 8192,
@@ -38,7 +38,7 @@
   "torch_dtype": "float32",
   "transformers_version": "4.46.1",
   "use_cache": true,
-  "…
+  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-1B",
   "auto_map": {
     "AutoConfig": "configuration_rotary_indictrans.RotaryIndicTransConfig",
     "AutoModelForSeq2SeqLM": "modeling_rotary_indictrans.RotaryIndicTransForConditionalGeneration"
```
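The restored `name_or_path` pairs with the `auto_map` block: loading by that repo id routes `AutoConfig` and `AutoModelForSeq2SeqLM` through the custom rotary classes shipped in the repo, which is why `trust_remote_code=True` is required. A quick sketch of what loading resolves to (class names follow from the `auto_map` entries; the `dropout` value is the one set in this change):

```python
from transformers import AutoConfig, AutoModelForSeq2SeqLM

repo = "prajdabre/rotary-indictrans2-indic-en-1B"  # the "name_or_path" above

config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(type(config).__name__)  # RotaryIndicTransConfig, per the AutoConfig entry
print(config.dropout)         # 0.0 after this change

model = AutoModelForSeq2SeqLM.from_pretrained(repo, trust_remote_code=True)
print(type(model).__name__)   # RotaryIndicTransForConditionalGeneration
```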
tokenization_indictrans.py (CHANGED)

```diff
@@ -128,7 +128,7 @@ class IndicTransTokenizer(PreTrainedTokenizer):
 
         super().__init__(
             src_vocab_file=self.src_vocab_fp,
-            tgt_vocab_file=self.…
+            tgt_vocab_file=self.tgt_vocab_fp,
             do_lower_case=do_lower_case,
             unk_token=unk_token,
             bos_token=bos_token,
@@ -190,11 +190,9 @@ class IndicTransTokenizer(PreTrainedTokenizer):
     def vocab_size(self) -> int:
         return self.src_vocab_size
 
-    @lru_cache(maxsize=10240)
     def _convert_token_to_id(self, token: str) -> int:
         return self.encoder.get(token, self.unk_token_id)
 
-    @lru_cache(maxsize=10240)
     def _convert_id_to_token(self, index: int) -> str:
         return self.decoder.get(index, self.unk_token)
 
```
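Whatever the motivation for dropping the two `@lru_cache(maxsize=10240)` decorators here, a known side effect of `lru_cache` on instance methods is that the cache key includes `self`, so the class-level cache keeps every tokenizer instance alive for the life of the process, while the wrapped bodies are single `dict.get` lookups anyway. A small illustration of that retention behaviour (the class and method names are hypothetical, for the demo only):

```python
import functools

class Demo:
    @functools.lru_cache(maxsize=10240)
    def lookup(self, token: str) -> int:  # cache key is (self, token)
        return len(token)

d = Demo()
d.lookup("hello")
del d  # the instance is NOT freed: the class-level cache still holds a reference to it
print(Demo.lookup.cache_info().currsize)  # 1 -> the cached entry (and `self`) lives on
```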
tokenizer_config.json (CHANGED)

```diff
@@ -37,9 +37,9 @@
   "clean_up_tokenization_spaces": true,
   "do_lower_case": false,
   "eos_token": "</s>",
-  "model_max_length": …
+  "model_max_length": 4096,
   "pad_token": "<pad>",
-  "name_or_path": "…
+  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-1B",
   "tokenizer_class": "IndicTransTokenizer",
   "auto_map": {
     "AutoTokenizer": [
```
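One practical effect of the `model_max_length` bump: callers that pass `truncation=True` without an explicit `max_length` now truncate at 4096 tokens. A quick check once the tokenizer is loaded (repo id taken from `name_or_path` above):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "prajdabre/rotary-indictrans2-indic-en-1B", trust_remote_code=True
)
print(tok.model_max_length)  # 4096 after this change
```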