prajdabre committed (verified)
Commit: 5f70327 · Parent(s): 4372577

Bulk update: multiple file changes

Files changed (4)
  1. README.md +9 -12
  2. config.json +2 -2
  3. tokenization_indictrans.py +1 -3
  4. tokenizer_config.json +2 -2
README.md CHANGED
@@ -5,7 +5,7 @@ license: mit
 These models are created from their respective IndicTrans2 parent versions by simply replacing the Sinusoidal Positional Embedding with Rotary Positional Embedding ([Su _et al._](https://arxiv.org/abs/2104.09864)), and finetuning them for further alignment.
 
 *NOTE*:
-These models are my independent reproduction of the paper: [Towards Inducing Document-Level Abilities in Standard Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).
+These models are my independent reproduction of the paper: [Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models](https://arxiv.org/abs/2408.11382).
 
 Detailed information on the data mixture, hyperparameters, and training curriculum can be found in the paper.
 
@@ -43,8 +43,6 @@ batch = tokenizer(
     batch, padding="longest", truncation=True, max_length=2048, return_tensors="pt"
 ).to(device)
 
-print(batch)
-
 with torch.inference_mode():
     outputs = model.generate(
         **batch,
@@ -56,24 +54,23 @@ with torch.inference_mode():
         early_stopping=True
     )
 
-with tokenizer.as_target_tokenizer():
-    outputs = tokenizer.batch_decode(
-        outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
-    )
+# no target_tokenizer scoping is required anymore
+outputs = tokenizer.batch_decode(
+    outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
+)
 
 outputs = ip.postprocess_batch(outputs, lang="eng_Latn")
-print("| > Translations:", outputs[0])
-
+print(" | > Translations:", outputs[0])
 ```
 
 # Citation
 If you use these models directly or fine-tune them further for additional use cases, please cite the following work:
 
 ```bibtex
-@misc{gumma2024inducingdocumentlevelabilitiesstandard,
-      title={Towards Inducing Document-Level Abilities in Standard Multilingual Neural Machine Translation Models},
+@misc{gumma2025inducinglongcontextabilitiesmultilingual,
+      title={Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models},
       author={Varun Gumma and Pranjal A. Chitale and Kalika Bali},
-      year={2024},
+      year={2025},
       eprint={2408.11382},
       archivePrefix={arXiv},
      primaryClass={cs.CL},
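For readers who only skim these hunks, here is a minimal end-to-end sketch of the usage example after this change. The repo id, the `IndicProcessor` calls, and the generation arguments outside the shown hunks are assumptions based on the standard IndicTrans2 workflow, not part of this diff; only the tokenizer call and the decode step (no more `as_target_tokenizer()` scoping) mirror the updated README.

```python
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# IndicProcessor comes from AI4Bharat's IndicTransToolkit; the import path may
# differ across toolkit versions (newer releases expose IndicTransToolkit.processor).
from IndicTransToolkit import IndicProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
repo = "prajdabre/rotary-indictrans2-indic-en-1B"  # repo id taken from this commit's config

tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(repo, trust_remote_code=True).to(device)
ip = IndicProcessor(inference=True)

sentences = ["यह एक उदाहरण वाक्य है।"]  # illustrative Hindi input
batch = ip.preprocess_batch(sentences, src_lang="hin_Deva", tgt_lang="eng_Latn")
batch = tokenizer(
    batch, padding="longest", truncation=True, max_length=2048, return_tensors="pt"
).to(device)

with torch.inference_mode():
    # num_beams / max_new_tokens are illustrative; the README's full argument
    # list is not shown in this diff.
    outputs = model.generate(**batch, num_beams=5, max_new_tokens=256, early_stopping=True)

# No target_tokenizer scoping is required anymore: decode directly.
outputs = tokenizer.batch_decode(
    outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

# Post-process the decoded text with the IndicProcessor, as in the README.
outputs = ip.postprocess_batch(outputs, lang="eng_Latn")
print(" | > Translations:", outputs[0])
```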
config.json CHANGED
@@ -15,7 +15,7 @@
   "decoder_normalize_before": true,
   "decoder_start_token_id": 2,
   "decoder_vocab_size": 32296,
-  "dropout": 0.2,
+  "dropout": 0.0,
   "encoder_attention_heads": 16,
   "encoder_embed_dim": 1024,
   "encoder_ffn_dim": 8192,
@@ -38,7 +38,7 @@
   "torch_dtype": "float32",
   "transformers_version": "4.46.1",
   "use_cache": true,
-  "_name_or_path": "VarunGumma/rotary-indictrans2-indic-en-1B",
+  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-1B",
   "auto_map": {
     "AutoConfig": "configuration_rotary_indictrans.RotaryIndicTransConfig",
     "AutoModelForSeq2SeqLM": "modeling_rotary_indictrans.RotaryIndicTransForConditionalGeneration"
tokenization_indictrans.py CHANGED
@@ -128,7 +128,7 @@ class IndicTransTokenizer(PreTrainedTokenizer):
 
         super().__init__(
             src_vocab_file=self.src_vocab_fp,
-            tgt_vocab_file=self.src_vocab_fp,
+            tgt_vocab_file=self.tgt_vocab_fp,
             do_lower_case=do_lower_case,
             unk_token=unk_token,
             bos_token=bos_token,
@@ -190,11 +190,9 @@ class IndicTransTokenizer(PreTrainedTokenizer):
     def vocab_size(self) -> int:
         return self.src_vocab_size
 
-    @lru_cache(maxsize=10240)
     def _convert_token_to_id(self, token: str) -> int:
         return self.encoder.get(token, self.unk_token_id)
 
-    @lru_cache(maxsize=10240)
     def _convert_id_to_token(self, index: int) -> str:
         return self.decoder.get(index, self.unk_token)
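Both converters are plain dict lookups, so dropping `@lru_cache` leaves their results unchanged: `dict.get` is already a constant-time hit, and the decorator only added a second cache in front of it. A standalone sketch with hypothetical data to make that concrete; the names below are illustrative and not attributes of the class beyond what the hunk shows:

```python
# Hypothetical standalone illustration, not the tokenizer class itself.
encoder = {"▁नमस्ते": 1234}   # token -> id, stands in for self.encoder
decoder = {1234: "▁नमस्ते"}   # id -> token, stands in for self.decoder
unk_token, unk_token_id = "<unk>", 3

def convert_token_to_id(token: str) -> int:
    # Same result with or without an lru_cache wrapper.
    return encoder.get(token, unk_token_id)

def convert_id_to_token(index: int) -> str:
    return decoder.get(index, unk_token)

assert convert_token_to_id("▁नमस्ते") == 1234
assert convert_token_to_id("missing") == unk_token_id
assert convert_id_to_token(999_999) == unk_token
```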
 
tokenizer_config.json CHANGED
@@ -37,9 +37,9 @@
   "clean_up_tokenization_spaces": true,
   "do_lower_case": false,
   "eos_token": "</s>",
-  "model_max_length": 256,
+  "model_max_length": 4096,
   "pad_token": "<pad>",
-  "name_or_path": "ai4bharat/indictrans2-en-indic-1B",
+  "name_or_path": "prajdabre/rotary-indictrans2-indic-en-1B",
   "tokenizer_class": "IndicTransTokenizer",
   "auto_map": {
     "AutoTokenizer": [