File size: 6,294 Bytes
270002f 4771bfa 103b9c8 4771bfa 103b9c8 4771bfa 103b9c8 270002f 4771bfa 5e4de89 4771bfa 5e4de89 4771bfa 5e4de89 4771bfa 5e4de89 4771bfa 5e4de89 4771bfa 5e4de89 4771bfa 5e4de89 4771bfa 5e4de89 4771bfa 5e4de89 4771bfa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
---
language:
- 'no'
- nb
- nn
- en
inference: false
tags:
- Norwegian
- English
- translation
license: cc-by-4.0
pipeline_tag: translation
---
# NorT5 large finetuned for English → Norwegian (Bokmål or Nynorsk) translation
<img src="https://huggingface.co/ltg/norbert3-base/resolve/main/norbert.png" width=12.5%>
## Example usage
This model is specifically finetuned for translating documents in the English-to-Norwegian direction.
Unlike traditional NMT models, it is trained on paragraph-to-paragraph translation – the translation quality is thus better if you feed it whole paragraphs instead of segmented sentences.
A simple example of how to use this model can be found in the `translate.py` file:
```python
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers.generation import LogitsProcessor
class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
    """Shift the scores of already-present tokens by a bias-derived offset.

    The per-token offset is built once from the bias of the last layer of
    ``model.classifier.nonlinearity``: the bias is log-softmax-normalised,
    shifted so that its maximum is zero, and scaled by ``penalty``. For a
    positive ``penalty`` every offset is therefore non-positive.
    """

    def __init__(self, penalty: float, model):
        """Precompute the per-token offset vector.

        Args:
            penalty: Scale applied to the max-shifted log-softmax of the bias.
            model: Model exposing ``classifier.nonlinearity[-1].bias``.
        """
        last_bias = model.classifier.nonlinearity[-1].bias.data
        # `dim` is passed explicitly: relying on the implicit dimension is
        # deprecated and emits a warning. The bias is used below as a single
        # row broadcast over the batch, so the last axis is the vocabulary.
        last_bias = torch.nn.functional.log_softmax(last_bias, dim=-1)
        self.penalty = penalty * (last_bias - last_bias.max())

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Apply the offset to the entries of ``scores`` indexed by ``input_ids``.

        ``scores`` is modified in place and also returned.
        """
        # Offsets live wherever the model's bias lives; move them next to the ids.
        offsets = self.penalty.unsqueeze(0).to(input_ids.device)
        # Pick the shifted score of every token id that occurs in `input_ids` ...
        penalized_score = torch.gather(scores + offsets, 1, input_ids).to(scores.dtype)
        # ... and write it back at those same positions.
        scores.scatter_(1, input_ids, penalized_score)
        return scores
class Translator:
    """Callable wrapper that translates newline-separated paragraphs.

    Each line of the input is translated as one independent sequence; the
    translated lines are joined back with newlines.
    """

    def __init__(self, model_path="ltg/nort5-large-en-no-translation", device="cpu"):
        """Load the tokenizer and the model and move the model to ``device``.

        Args:
            model_path: Checkpoint name or path passed to ``from_pretrained``.
            device: Device the model and the inputs are moved to.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        token_to_id = self.tokenizer.convert_tokens_to_ids
        self.cls_index = token_to_id("[CLS]")
        self.sep_index = token_to_id("[SEP]")
        self.eos_index = token_to_id("[EOS]")
        self.pad_index = token_to_id("[PAD]")
        self.eng_index = token_to_id(">>eng<<")
        self.nob_index = token_to_id(">>nob<<")
        self.nno_index = token_to_id(">>nno<<")

        # NOTE(security): trust_remote_code=True allows code shipped with the
        # checkpoint to be executed; only point `model_path` at trusted sources.
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path, trust_remote_code=True)

        self.device = device
        print(f"SYSTEM: Running on {self.device}", flush=True)
        self.model = self.model.to(device)
        self.model.eval()
        print("Sucessfully loaded the model to the memory")

        # Language codes accepted by __call__, mapped to their tag token ids.
        self.LANGUAGE_IDS = {
            "en": self.eng_index,
            "nb": self.nob_index,
            "nn": self.nno_index,
        }

    def __call__(self, source, source_language, target_language):
        """Translate ``source`` from ``source_language`` into ``target_language``.

        Args:
            source: Text to translate; every line is handled as one sequence.
            source_language: Key of ``LANGUAGE_IDS`` ("en", "nb" or "nn").
            target_language: Key of ``LANGUAGE_IDS`` ("en", "nb" or "nn").

        Returns:
            The translated lines joined with newline characters.

        Raises:
            KeyError: If either language code is not in ``LANGUAGE_IDS``.
        """
        # Resolve the language tags first so a bad code fails before any
        # tokenization or generation work, with a message naming the options.
        try:
            target_id = self.LANGUAGE_IDS[target_language]
            source_id = self.LANGUAGE_IDS[source_language]
        except KeyError as err:
            raise KeyError(
                f"Unsupported language code {err.args[0]!r}; "
                f"expected one of {sorted(self.LANGUAGE_IDS)}"
            ) from None

        paragraphs = [line.strip() for line in source.split('\n')]

        # Every sequence is framed as: [CLS] <target tag> <source tag> tokens... [SEP]
        prefix = [self.cls_index, target_id, source_id]
        rows = [
            torch.tensor(prefix + token_ids + [self.sep_index])
            for token_ids in self.tokenizer(paragraphs).input_ids
        ]
        source_subwords = torch.nn.utils.rnn.pad_sequence(
            rows, batch_first=True, padding_value=self.pad_index
        )
        # NOTE(review): rows longer than 512 ids are cut here, which also
        # removes their trailing [SEP] - confirm this is intended.
        source_subwords = source_subwords[:, :512].to(self.device)

        generate_kwargs = dict(
            input_ids=source_subwords,
            attention_mask=(source_subwords != self.pad_index).long(),
            max_new_tokens=512 - 1,
            num_beams=8,
            length_penalty=1.6,
            early_stopping=True,
            do_sample=False,
            use_cache=True,
            logits_processor=[
                RepetitionPenaltyLogitsProcessor(0.5, self.model),
                transformers.LogitNormalization(),
            ],
        )

        # bfloat16 autocast is requested for CUDA only. Comparing the raw value
        # with "cpu" would also switch it on for a `torch.device` object or a
        # non-CUDA accelerator, so the device type is inspected instead.
        use_autocast = torch.device(self.device).type == "cuda"
        with torch.inference_mode():
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=use_autocast):
                output = self.model.generate(**generate_kwargs).tolist()

        decoded = [
            self.tokenizer.decode(token_ids, skip_special_tokens=True).strip()
            for token_ids in output
        ]
        return '\n'.join(decoded)
if __name__ == "__main__":
    # Demo run: build the translator, translate one short English paragraph
    # into Bokmål, then show the source text followed by its translation.
    translate = Translator()
    english = "How are you feeling right now? Better?"
    norwegian = translate(english, "en", "nb")
    for text in (english, norwegian):
        print(text)
```
## The NorT5 and NorBERT family
This is the official release of a new generation of NorT5 language models, described in the paper [**NorBench — A Benchmark for Norwegian Language Models**](https://arxiv.org/abs/2305.03880). Please read the paper to learn more details about the model.
## Other sizes:
- [NorT5 xs (32M)](https://huggingface.co/ltg/nort5-xs)
- [NorT5 small (88M)](https://huggingface.co/ltg/nort5-small)
- [NorT5 base (228M)](https://huggingface.co/ltg/nort5-base)
- [NorT5 large (808M)](https://huggingface.co/ltg/nort5-large)
## Encoder-only NorBERT siblings:
- [NorBERT 3 xs (15M)](https://huggingface.co/ltg/norbert3-xs)
- [NorBERT 3 small (40M)](https://huggingface.co/ltg/norbert3-small)
- [NorBERT 3 base (123M)](https://huggingface.co/ltg/norbert3-base)
- [NorBERT 3 large (323M)](https://huggingface.co/ltg/norbert3-large)
## Cite us
```bibtex
@inproceedings{samuel-etal-2023-norbench,
title = "{N}or{B}ench {--} A Benchmark for {N}orwegian Language Models",
author = "Samuel, David and
Kutuzov, Andrey and
Touileb, Samia and
Velldal, Erik and
{\O}vrelid, Lilja and
R{\o}nningstad, Egil and
Sigdel, Elina and
Palatkina, Anna",
booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)",
month = may,
year = "2023",
address = "T{\'o}rshavn, Faroe Islands",
publisher = "University of Tartu Library",
url = "https://aclanthology.org/2023.nodalida-1.61",
pages = "618--633",
abstract = "We present NorBench: a streamlined suite of NLP tasks and probes for evaluating Norwegian language models (LMs) on standardized data splits and evaluation metrics. We also introduce a range of new Norwegian language models (both encoder and encoder-decoder based). Finally, we compare and analyze their performance, along with other existing LMs, across the different benchmark tests of NorBench.",
}
```