import os

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

hf_token = os.environ["hf_token"]

# Using the small-parameter versions of both models for faster inference on HF.
b_tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b1")
b_model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b1", device_map="auto")

g_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b", token=hf_token)
g_model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b", token=hf_token, device_map="auto")


def sentence_completion(model_name, prompt):
    # Pick the tokenizer/model pair for the selected backend.
    if model_name == "Bloom":
        tokenizer, model = b_tokenizer, b_model
    else:  # "Gemma"
        tokenizer, model = g_tokenizer, g_model

    # Move the encoded prompt onto the same device the model was placed on.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Same generation budget for both models so the comparison is fair.
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=32,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


interface = gr.Interface(
    fn=sentence_completion,
    inputs=[
        gr.Radio(["Bloom", "Gemma"], label="Choose model"),
        gr.Textbox(placeholder="Enter sentence"),
    ],
    outputs="text",
    title="Bloom vs Gemma Sentence Completion",
)
interface.launch(share=True, debug=True)
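# A minimal smoke test, assuming the two models above loaded successfully; the
# prompt string is illustrative, not from the original. Uncomment to exercise
# the completion function without the UI (this runs only after the Gradio
# server is stopped, since launch(debug=True) blocks):
# print(sentence_completion("Bloom", "The weather today is"))
# print(sentence_completion("Gemma", "The weather today is"))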