import gradio as gr

############### VANILLA INFERENCE ###############
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# model_path = "anzorq/m2m100_418M_ft_ru-kbd_44K"
# src_lang = "ru"
# tgt_lang = "zu"
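# # NB: M2M-100 has no language code for Kabardian ("kbd"), so this fine-tune
# # appears to repurpose the "zu" (Zulu) code as the target-language marker.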

# # tokenizer = AutoTokenizer.from_pretrained(model_path, src_lang=src_lang)
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_path, use_safetensors=True)#, load_in_4bit=True, device_map="auto")
# model.to_bettertransformer()
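# # Note: to_bettertransformer() requires the `optimum` package to be installed.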

# def translate(text, num_beams=4, num_return_sequences=4):
#   inputs = tokenizer(text, return_tensors="pt")

#   num_return_sequences = min(num_return_sequences, num_beams)

#   translated_tokens = model.generate(
#       **inputs, forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang], num_beams=num_beams, num_return_sequences=num_return_sequences
#   )

#   translations = []
#   for translation in tokenizer.batch_decode(translated_tokens, skip_special_tokens=True):
#       translations.append(translation)

#   # result = {"input":text, "translations":translations}
#   return text, translations


############### IPEX OPTIMIZED INFERENCE ###############
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# from optimum.bettertransformer import BetterTransformer
# import intel_extension_for_pytorch as ipex
# from transformers.modeling_outputs import BaseModelOutput
# import torch

# model_path = "anzorq/m2m100_418M_ft_ru-kbd_44K"
# src_lang = "ru"
# tgt_lang = "zu"

# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# # flash attention optimization
# model = BetterTransformer.transform(model, keep_original_model=False)

# # ipex optimization
# model.eval()
# model = ipex.optimize(model, dtype=torch.float, level="O1", conv_bn_folding=False, inplace=True)

# # Get the encoder
# encoder = model.get_encoder()

# # Prepare an example input for the encoder
# example_input_text = "Example text in Russian"
# inputs_example = tokenizer(example_input_text, return_tensors="pt")

# # Trace just the encoder with strict=False
# scripted_encoder = torch.jit.trace(encoder, inputs_example['input_ids'], strict=False)
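# # (strict=False lets the traced encoder return its dict-like output
# # instead of a plain tuple.)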

# def translate(text, num_beams=4, num_return_sequences=4):
#     inputs = tokenizer(text, return_tensors="pt")
#     num_return_sequences = min(num_return_sequences, num_beams)

#     # Use the scripted encoder for the first step of inference
#     encoder_output_dict = scripted_encoder(inputs['input_ids'])
#     encoder_outputs = BaseModelOutput(last_hidden_state=encoder_output_dict['last_hidden_state'])

#     # Use the original, untraced model for the second step, passing the encoder's outputs as inputs
#     translated_tokens = model.generate(
#         encoder_outputs=encoder_outputs,
#         forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang], 
#         num_beams=num_beams, 
#         num_return_sequences=num_return_sequences
#     )

#     translations = [tokenizer.decode(translation, skip_special_tokens=True) for translation in translated_tokens]
#     return text, translations

############### ONNX MODEL INFERENCE ###############
# from transformers import AutoTokenizer, pipeline
# from optimum.onnxruntime import ORTModelForSeq2SeqLM
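# # The "onnx" subfolder with encoder_model_optimized.onnx is assumed to have
# # been exported and graph-optimized beforehand, e.g. with optimum's exporter
# # (hypothetical invocation, not part of this file):
# #   optimum-cli export onnx --model anzorq/m2m100_418M_ft_ru-kbd_44K onnx/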

# model_id = "anzorq/m2m100_418M_ft_ru-kbd_44K"

# model = ORTModelForSeq2SeqLM.from_pretrained(model_id, subfolder="onnx", file_name="encoder_model_optimized.onnx")
# tokenizer = AutoTokenizer.from_pretrained(model_id)

# def translate(text, num_beams=4, num_return_sequences=4):
#   inputs = tokenizer(text, return_tensors="pt")

#   num_return_sequences = min(num_return_sequences, num_beams)

#   translated_tokens = model.generate(
#       **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["zu"], num_beams=num_beams, num_return_sequences=num_return_sequences
#   )

#   translations = []
#   for translation in tokenizer.batch_decode(translated_tokens, skip_special_tokens=True):
#       translations.append(translation)

#   return text, translations


############### CTRANSLATE2 INFERENCE ###############
import ctranslate2
import transformers
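# The "ctranslate" directory is assumed to contain a CTranslate2 conversion of
# the fine-tuned checkpoint, e.g. produced beforehand with (hypothetical
# invocation, not part of this file):
#   ct2-transformers-converter --model anzorq/m2m100_418M_ft_ru-kbd_44K --output_dir ctranslate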

translator = ctranslate2.Translator("ctranslate")
tokenizer = transformers.AutoTokenizer.from_pretrained("anzorq/m2m100_418M_ft_ru-kbd_44K")

def translate(text, num_beams=4, num_return_sequences=4):

    num_return_sequences = min(num_return_sequences, num_beams)

    # Tokenize the source text into subword tokens.
    source = tokenizer.convert_ids_to_tokens(tokenizer.encode(text))
    # Force decoding to start with the target-language token
    # ("zu" stands in for Kabardian in this fine-tune).
    target_prefix = [tokenizer.lang_code_to_token["zu"]]
    results = translator.translate_batch(
        [source],
        target_prefix=[target_prefix],
        beam_size=num_beams,
        num_hypotheses=num_return_sequences
    )

    translations = []
    for hypothesis in results[0].hypotheses:
        # Strip the target-language prefix token before decoding.
        target = hypothesis[1:]
        decoded_sentence = tokenizer.decode(tokenizer.convert_tokens_to_ids(target))
        translations.append(decoded_sentence)

    return text, translations
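# Example usage (assumes the converted model directory is present):
#   text, hypotheses = translate("Сегодня хорошая погода", num_beams=4, num_return_sequences=2)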

output = gr.Textbox()
# with gr.Accordion("Advanced Options"):
num_beams = gr.Slider(2, 10, step=1, value=4, label="Number of beams")
num_return_sequences = gr.Slider(2, 10, step=1, value=4, label="Number of returned sentences")


title = "Russian-Circassian translator demo"
article = "<p style='text-align: center'>Want to help? Join the <a href='https://discord.gg/cXwv495r' target='_blank'>Discord server</a></p>"

# examples = [
#     ["Мы идем домой"],
#     ["Сегодня хорошая погода"],
#     ["Дети играют во дворе"],
#     ["We live in a big house"],
#     ["Tu es une bonne personne."],
#     ["أين تعيش؟"],
#     ["Bir şeyler yapmak istiyorum."],
#     ["– Если я его отпущу, то ты вовек не сможешь его поймать, – заявил Сосруко."],
#     ["Как только старик ушел, Сатаней пошла к Саусырыко."],
#     ["我永远不会放弃你。"],
#     ["우리는 소치에 살고 있습니다."],
# ]

gr.Interface(
  fn=translate,
  inputs=["text", num_beams, num_return_sequences],
  outputs=["text", output],
  title=title,
  # examples=examples,
  article=article).launch()

# import gradio as gr

# title = "Русско-черкесский переводчик"
# description = "Demo of a Russian-Circassian (Kabardian dialect) translator. <br>It is based on Facebook's <a href=\"https://about.fb.com/news/2020/10/first-multilingual-machine-translation-model/\">M2M-100</a> machine learning model and has been trained on 45,000 Russian-Circassian sentence pairs. <br>It can also translate from 100 other languages to Circassian (English, French, Spanish, etc.), though less accurately. <br>This is only a demo, not a finished product: 45,000 sentence pairs is not enough for an accurate machine translation model, so quality is still low and will improve with time and more data. <br>You can help by finding sentence sources (books, web pages, etc.), scanning books, OCRing documents, cleaning data, and other tasks. <br>If you are interested in helping out with this project, please contact me at the link below."
# article = """<p style='text-align: center'><a href='https://arxiv.org/abs/1806.00187'>Scaling Neural Machine Translation</a> | <a href='https://github.com/pytorch/fairseq/'>Github Repo</a></p>"""

# examples = [
#     ["Мы идем домой"],
#     ["Сегодня хорошая погода"],
#     ["Дети играют во дворе"],
#     ["We live in a big house"],
#     ["Tu es une bonne personne."],
#     ["أين تعيش؟"],
#     ["Bir şeyler yapmak istiyorum."],
# ]

# gr.Interface.load("models/anzorq/m2m100_418M_ft_ru-kbd_44K", title=title, description=description, article=article, examples=examples).launch()