prajdabre committed (verified)
Commit: 5f70327 · Parent(s): 4372577

Bulk update: multiple file changes

Files changed (4)
  1. README.md +9 -12
  2. config.json +2 -2
  3. tokenization_indictrans.py +1 -3
  4. tokenizer_config.json +2 -2
README.md CHANGED
@@ -5,7 +5,7 @@ license: mit
 These models are created from their respective IndicTrans2 parent versions by simply replacing the Sinusoidal Positional Embedding with Rotary Positional Embedding ([Su _et al._](https://arxiv.org/abs/2104.09864)), and finetuning them for further alignment.
 
 *NOTE*:
-These models are my independent reproduction of the paper: [Towards Inducing Document-Level Abilities in Standard Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).
+These models are my independent reproduction of the paper: [Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).
 
 Detailed information on the data mixture, hyperparameters, and training curriculum can be found in the paper.
 
@@ -43,8 +43,6 @@ batch = tokenizer(
     batch, padding="longest", truncation=True, max_length=2048, return_tensors="pt"
 ).to(device)
 
-print(batch)
-
 with torch.inference_mode():
     outputs = model.generate(
         **batch,
@@ -56,24 +54,23 @@ with torch.inference_mode():
         early_stopping=True
     )
 
-with tokenizer.as_target_tokenizer():
-    outputs = tokenizer.batch_decode(
-        outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
-    )
+# no target_tokenizer scoping is required anymore
+outputs = tokenizer.batch_decode(
+    outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
+)
 
 outputs = ip.postprocess_batch(outputs, lang="eng_Latn")
-print("| > Translations:", outputs[0])
-
+print(" | > Translations:", outputs[0])
 ```
 
 # Citation
 If you use these models directly or fine-tune them further for additional use cases, please cite the following work:
 
 ```bibtex
-@misc{gumma2024inducingdocumentlevelabilitiesstandard,
-      title={Towards Inducing Document-Level Abilities in Standard Multilingual Neural Machine Translation Models},
+@misc{gumma2025inducinglongcontextabilitiesmultilingual,
+      title={Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models},
       author={Varun Gumma and Pranjal A. Chitale and Kalika Bali},
-      year={2024},
+      year={2025},
       eprint={2408.11382},
       archivePrefix={arXiv},
      primaryClass={cs.CL},
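For readers who only skim these hunks, here is a minimal end-to-end sketch of the usage example after this change. The repo id, the `IndicProcessor` calls, and the generation arguments outside the shown hunks are assumptions based on the standard IndicTrans2 workflow, not part of this diff; only the tokenizer call and the decode step (no more `as_target_tokenizer()` scoping) mirror the updated README.

```python
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# IndicProcessor comes from AI4Bharat's IndicTransToolkit; the import path may
# differ across toolkit versions (newer releases expose IndicTransToolkit.processor).
from IndicTransToolkit import IndicProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
repo = "prajdabre/rotary-indictrans2-indic-en-1B"  # repo id taken from this commit's config

tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(repo, trust_remote_code=True).to(device)
ip = IndicProcessor(inference=True)

sentences = ["यह एक उदाहरण वाक्य है।"]  # illustrative Hindi input
batch = ip.preprocess_batch(sentences, src_lang="hin_Deva", tgt_lang="eng_Latn")
batch = tokenizer(
    batch, padding="longest", truncation=True, max_length=2048, return_tensors="pt"
).to(device)

with torch.inference_mode():
    # num_beams / max_new_tokens are illustrative; the README's full argument
    # list is not shown in this diff.
    outputs = model.generate(**batch, num_beams=5, max_new_tokens=256, early_stopping=True)

# No target_tokenizer scoping is required anymore: decode directly.
outputs = tokenizer.batch_decode(
    outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

# Post-process the decoded text with the IndicProcessor, as in the README.
outputs = ip.postprocess_batch(outputs, lang="eng_Latn")
print(" | > Translations:", outputs[0])
```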
config.json CHANGED
@@ -15,7 +15,7 @@
   "decoder_normalize_before": true,
   "decoder_start_token_id": 2,
   "decoder_vocab_size": 32296,
-  "dropout": 0.2,
+  "dropout": 0.0,
   "encoder_attention_heads": 16,
   "encoder_embed_dim": 1024,
   "encoder_ffn_dim": 8192,
@@ -38,7 +38,7 @@
   "torch_dtype": "float32",
   "transformers_version": "4.46.1",
   "use_cache": true,
-  "_name_or_path": "VarunGumma/rotary-indictrans2-indic-en-1B",
+  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-1B",
   "auto_map": {
     "AutoConfig": "configuration_rotary_indictrans.RotaryIndicTransConfig",
     "AutoModelForSeq2SeqLM": "modeling_rotary_indictrans.RotaryIndicTransForConditionalGeneration"
tokenization_indictrans.py CHANGED
@@ -128,7 +128,7 @@ class IndicTransTokenizer(PreTrainedTokenizer):
 
         super().__init__(
             src_vocab_file=self.src_vocab_fp,
-            tgt_vocab_file=self.src_vocab_fp,
+            tgt_vocab_file=self.tgt_vocab_fp,
             do_lower_case=do_lower_case,
             unk_token=unk_token,
             bos_token=bos_token,
@@ -190,11 +190,9 @@ class IndicTransTokenizer(PreTrainedTokenizer):
     def vocab_size(self) -> int:
         return self.src_vocab_size
 
-    @lru_cache(maxsize=10240)
     def _convert_token_to_id(self, token: str) -> int:
         return self.encoder.get(token, self.unk_token_id)
 
-    @lru_cache(maxsize=10240)
     def _convert_id_to_token(self, index: int) -> str:
         return self.decoder.get(index, self.unk_token)
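Both converters are plain dict lookups, so dropping `@lru_cache` leaves their results unchanged: `dict.get` is already a constant-time hit, and the decorator only added a second cache in front of it. A standalone sketch with hypothetical data to make that concrete; the names below are illustrative and not attributes of the class beyond what the hunk shows:

```python
# Hypothetical standalone illustration, not the tokenizer class itself.
encoder = {"▁नमस्ते": 1234}   # token -> id, stands in for self.encoder
decoder = {1234: "▁नमस्ते"}   # id -> token, stands in for self.decoder
unk_token, unk_token_id = "<unk>", 3

def convert_token_to_id(token: str) -> int:
    # Same result with or without an lru_cache wrapper.
    return encoder.get(token, unk_token_id)

def convert_id_to_token(index: int) -> str:
    return decoder.get(index, unk_token)

assert convert_token_to_id("▁नमस्ते") == 1234
assert convert_token_to_id("missing") == unk_token_id
assert convert_id_to_token(999_999) == unk_token
```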
 
tokenizer_config.json CHANGED
@@ -37,9 +37,9 @@
   "clean_up_tokenization_spaces": true,
   "do_lower_case": false,
   "eos_token": "</s>",
-  "model_max_length": 256,
+  "model_max_length": 4096,
   "pad_token": "<pad>",
-  "name_or_path": "ai4bharat/indictrans2-en-indic-1B",
+  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-1B",
   "tokenizer_class": "IndicTransTokenizer",
   "auto_map": {
     "AutoTokenizer": [