"""Gradio demo that streams translations from the `nort5_en-no_base` model.

The script loads the tokenizer and model at import time, defines a streamer
that can handle a batch of sequences (one per input line), and wires a small
two-column Gradio UI around the `translate` generator.
"""

from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TextIteratorStreamer
from transformers.generation import LogitsProcessor

MODEL_NAME = "nort5_en-no_base"
# Maximum number of source subwords kept per line; generation is capped at
# MAX_LENGTH - 1 new tokens.
MAX_LENGTH = 512

print("Starting to load the model to memory")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
cls_index = tokenizer.convert_tokens_to_ids("[CLS]")
sep_index = tokenizer.convert_tokens_to_ids("[SEP]")
eos_index = tokenizer.convert_tokens_to_ids("[EOS]")
pad_index = tokenizer.convert_tokens_to_ids("[PAD]")
eng_index = tokenizer.convert_tokens_to_ids(">>eng<<")
nob_index = tokenizer.convert_tokens_to_ids(">>nob<<")
nno_index = tokenizer.convert_tokens_to_ids(">>nno<<")

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"SYSTEM: Running on {device}", flush=True)

model = model.to(device)
model.eval()

print("Sucessfully loaded the model to the memory")

LANGUAGES = [
    "🇬🇧 English",
    "🇳🇴 Norwegian (Bokmål)",
    "🇳🇴 Norwegian (Nynorsk)",
]

# Maps each UI language label to the id of its language-tag token.
LANGUAGE_IDS = {
    "🇬🇧 English": eng_index,
    "🇳🇴 Norwegian (Bokmål)": nob_index,
    "🇳🇴 Norwegian (Nynorsk)": nno_index,
}


class BatchStreamer(TextIteratorStreamer):
    """Text streamer that accumulates one token cache per batch row.

    On every update the full text of all rows is re-decoded and emitted as a
    single string with one row per line, so a consumer always receives the
    complete translation so far rather than an increment.
    """

    def _decode_cache(self) -> str:
        """Decode every cached row and join the rows with newlines."""
        paragraphs = [
            self.tokenizer.decode(row, **self.decode_kwargs).strip()
            for row in self.token_cache
        ]
        return '\n'.join(paragraphs)

    def put(self, value):
        """Append new token ids to the per-row caches and emit the full text.

        Args:
            value: tensor whose first dimension is the batch size. After
                `tolist()` each row may be a single int or a list of ints;
                both shapes are handled.
        """
        if not self.token_cache:
            self.token_cache = [[] for _ in range(value.size(0))]

        # Add the new token(s) to each row's cache, then decode everything.
        for row, new_ids in zip(self.token_cache, value.tolist()):
            row += [new_ids] if isinstance(new_ids, int) else new_ids

        self.on_finalized_text(self._decode_cache())

    def end(self):
        """Emit the final text, reset the caches and signal the end of stream."""
        if self.token_cache:
            printable_text = self._decode_cache()
            self.token_cache = []
            self.print_len = 0
        else:
            printable_text = ""

        self.next_tokens_are_prompt = True
        self.on_finalized_text(printable_text, stream_end=True)


class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
    """Penalise already-generated tokens using the output-layer bias.

    The per-token penalty is the log-softmax of the classifier's last bias,
    shifted so that its maximum is zero and scaled by `penalty`; every entry
    is therefore <= 0 for a positive `penalty`.
    """

    def __init__(self, penalty: float, model):
        """Precompute the per-vocabulary penalty vector.

        Args:
            penalty: multiplier applied to the shifted log-probabilities.
            model: model exposing `classifier.nonlinearity[-1].bias`.
        """
        last_bias = model.classifier.nonlinearity[-1].bias.data
        # Explicit `dim` avoids the implicit-dimension deprecation warning;
        # the bias is assumed to be 1-D, for which this matches the old
        # implicit choice.
        last_bias = torch.nn.functional.log_softmax(last_bias, dim=-1)
        self.penalty = penalty * (last_bias - last_bias.max())

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Add the penalty to the scores of tokens present in `input_ids`.

        `scores` is modified in place and also returned.
        """
        penalty = self.penalty.unsqueeze(0).to(input_ids.device)
        penalized_score = torch.gather(scores + penalty, 1, input_ids)
        scores.scatter_(1, input_ids, penalized_score)
        return scores


def translate(source, source_language, target_language):
    """Translate `source` line by line, yielding the growing translation.

    Each line of `source` becomes one sequence in a batch. Generation runs in
    a background thread and every yielded string is the complete translation
    so far (one output line per input line).

    Args:
        source: text to translate; lines are separated by newlines.
        source_language: a key of `LANGUAGE_IDS` naming the input language.
        target_language: a key of `LANGUAGE_IDS` naming the output language.

    Yields:
        The stripped translation produced so far. When both languages are
        equal the stripped input is yielded once, unchanged.
    """
    if source_language == target_language:
        unchanged = source.strip()
        yield unchanged
        return unchanged

    lines = [line.strip() for line in source.split('\n')]
    prefix = [cls_index, LANGUAGE_IDS[target_language], LANGUAGE_IDS[source_language]]
    encoded = [
        torch.tensor(prefix + ids + [sep_index])
        for ids in tokenizer(lines).input_ids
    ]
    source_subwords = torch.nn.utils.rnn.pad_sequence(
        encoded, batch_first=True, padding_value=pad_index
    )
    source_subwords = source_subwords[:, :MAX_LENGTH].to(device)

    streamer = BatchStreamer(tokenizer, timeout=60.0, skip_special_tokens=True)

    def generate(model, **kwargs):
        # Mixed precision is only enabled when not running on the CPU.
        with torch.inference_mode():
            with torch.autocast(enabled=device != "cpu", device_type=device, dtype=torch.bfloat16):
                return model.generate(**kwargs)

    generate_kwargs = dict(
        streamer=streamer,
        input_ids=source_subwords,
        attention_mask=(source_subwords != pad_index).long(),
        max_new_tokens=MAX_LENGTH - 1,
        num_beams=1,
        logits_processor=[RepetitionPenaltyLogitsProcessor(1.0, model)],
        do_sample=False,
        use_cache=True,
    )
    worker = Thread(target=generate, args=(model,), kwargs=generate_kwargs)
    worker.start()

    # Initialised up front so the final `return` is well defined even if the
    # streamer finishes without yielding any text.
    last_text = ""
    for new_text in streamer:
        last_text = new_text.strip()
        yield last_text

    # The stream has ended, so the worker is done (or about to be).
    worker.join()
    return last_text


def switch_inputs(source, target, source_language, target_language):
    """Swap the source/target texts and the source/target languages."""
    return target, source, target_language, source_language


with gr.Blocks() as demo:
    gr.Markdown("# Norwegian-English translation")

    with gr.Row():
        with gr.Column(scale=7, variant="panel"):
            source_language = gr.Dropdown(
                LANGUAGES, value=LANGUAGES[1], show_label=False
            )
            source = gr.Textbox(
                label="Source text",
                placeholder="What do you want to translate?",
                show_label=False,
                lines=7,
                max_lines=100,
                autofocus=True,
            )
            submit = gr.Button("Submit", variant="primary")

        with gr.Column(scale=7, variant="panel"):
            target_language = gr.Dropdown(
                LANGUAGES, value=LANGUAGES[0], show_label=False
            )
            target = gr.Textbox(
                label="Translation",
                show_label=False,
                interactive=False,
                lines=7,
                max_lines=100,
            )

    def _interactivity_update(enabled: bool):
        """Build the update that (un)locks every input control."""
        return {
            source: gr.update(interactive=enabled),
            submit: gr.update(interactive=enabled),
            source_language: gr.update(interactive=enabled),
            target_language: gr.update(interactive=enabled),
        }

    def update_state_after_user():
        """Lock the input controls while a translation is running."""
        return _interactivity_update(False)

    def update_state_after_return():
        """Unlock the input controls once the translation has finished."""
        return _interactivity_update(True)

    def _wire_translation(trigger):
        """Attach the lock -> translate -> unlock chain to an event trigger."""
        lockable = [source, submit, source_language, target_language]
        return trigger(
            fn=update_state_after_user,
            inputs=None,
            outputs=lockable,
            queue=False,
        ).then(
            fn=translate,
            inputs=[source, source_language, target_language],
            outputs=[target],
            queue=True,
        ).then(
            fn=update_state_after_return,
            inputs=None,
            outputs=lockable,
            queue=False,
        )

    # Pressing Enter in the source box and clicking the button behave alike.
    submit_event = _wire_translation(source.submit)
    submit_click_event = _wire_translation(submit.click)

demo.queue(max_size=32, concurrency_count=2)
demo.launch()