---
license: apache-2.0
datasets:
- booba-uz/translation-dataset-250k
language:
- en
- uz
metrics:
- bleu 35
base_model:
- facebook/nllb-200-distilled-600M
pipeline_tag: translation
library_name: transformers
---

# Model usage

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub repository id of the checkpoint; the tokenizer and the model are both
# loaded from it.
model_name = 'booba-uz/english-uzbek-translation_v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Language tags set on the tokenizer.
# NOTE(review): NLLB tokenizers normally use FLORES-200 codes such as
# "eng_Latn" / "uzn_Latn"; confirm that "en" / "uz" match how this
# checkpoint was fine-tuned before changing them.
tokenizer.src_lang = "en"
tokenizer.tgt_lang = "uz"

# Instruction text that translate_text() prepends to the input.
prefix = "Translate this text from English to uzbek: "


def translate_text(text: str, target_lang: str = 'uz') -> str:
    """Translate ``text`` with the module-level ``model`` and ``tokenizer``.

    The module-level ``prefix`` is prepended to ``text`` before tokenization.

    Args:
        text: The text to translate.
        target_lang: Accepted for backward compatibility only. The function
            body never reads it; the output language is determined by
            ``prefix`` and the tokenizer settings made at module level.

    Returns:
        The decoded first generated sequence with special tokens removed, or
        an empty string when ``text`` is empty or whitespace only.
    """
    # Nothing to translate: without this guard the model would be asked to
    # translate the bare instruction prefix and return unrelated text.
    if not text.strip():
        return ""

    input_ids = tokenizer.encode(prefix + text, return_tensors="pt", padding=True)
    # Beam search with 5 beams; generation is capped at max_length=200, so a
    # very long input may come back as a shortened translation.
    generated = model.generate(input_ids, num_beams=5, max_length=200, early_stopping=True)
    # Only one input was encoded, so the first returned sequence is the result.
    return tokenizer.decode(generated[0], skip_special_tokens=True)


# Sample English paragraph used to demonstrate the translation call.
input_text = (
    "An Azerbaijan Airlines Embraer ERJ-190AR aircraft crashed at Aktau "
    "Airport in Kazakhstan while attempting an emergency landing. The plane, "
    "registered as 4K-AZ65, was carrying 67 passengers and five crew members "
    "at the time. Some media reports suggest that the number of passengers "
    "exceeded 100, with over 60 identified as Russian citizens."
)

# Translate the input text to Uzbek
output_text = translate_text(input_text)
print("Translated text:", output_text)
```