Spaces:
Paused
Paused
import gradio as gr | |
from transformers import AutoTokenizer | |
from quantizer import load_quantized_model | |
import torch | |
# Markdown shown at the top of the demo page: a heading and an intro blurb.
# NOTE(review): the title says "Llama2", the blurb links keyfan/Qwen-72B-Chat-2bit
# and the duplicate badge targets Tonic/StableMed_Chat — confirm which is intended.
title = """# 🙋🏻♂️Welcome to 🌟Tonic's 2-Bit Llama2 on GPU-Zero! """
description = """
this model a 2 bit quantized model using [QuIP for all](https://github.com/chu-tianxiang/QuIP-for-all/). You can try out [keyfan/Qwen-72B-Chat-2bit](https://huggingface.co/keyfan/Qwen-72B-Chat-2bit) below or try it locally by cloning or duplicating this space. Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/StableMed_Chat?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>
Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community on 👻Discord: [Discord](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Polytonic](https://github.com/tonic-ai) & contribute to 🌟 [PolyGPT](https://github.com/tonic-ai/polygpt-alpha)
"""
# Directory that holds both the quantized checkpoint and its tokenizer files.
quant_dir = "llama-70b_2bit_quip"

# Build the quantized model first, then move it onto the CUDA device.
quant_model = load_quantized_model(quant_dir)
quant_model = quant_model.cuda()

# The tokenizer is read from the same directory as the model weights.
tokenizer = AutoTokenizer.from_pretrained(quant_dir)
def generate_text(input_text: str) -> str:
    """Sample text from the quantized model for a user prompt.

    Args:
        input_text: Prompt entered by the user.

    Returns:
        The decoded first sequence returned by ``quant_model.generate`` with
        tokenizer special tokens removed, or an empty string when the prompt
        is empty or whitespace-only.
    """
    # Nothing to condition on: skip tokenization and the GPU round trip
    # rather than sampling from an empty prompt.
    if not input_text or not input_text.strip():
        return ""

    input_ids = tokenizer.encode(input_text, return_tensors="pt").cuda()
    # generate() returns one row per input sequence; only one prompt is sent.
    output_ids = quant_model.generate(input_ids, do_sample=True)[0]
    # Strip special-token markers so they do not show up in the output box.
    return tokenizer.decode(output_ids, skip_special_tokens=True)
# Assemble the demo page: header Markdown, a prompt row, and an output box.
with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Row():
        input_text = gr.Textbox(
            label="Enter text here",
            placeholder="Type something...",
            lines=2,
        )
        submit_button = gr.Button("Generate")
    # `readonly` is not a Textbox argument; `interactive=False` is the
    # supported way to make the output box non-editable.
    output_text = gr.Textbox(label="Generated Text", interactive=False)
    # Run generation on click and show the result in the output box.
    submit_button.click(
        fn=generate_text,
        inputs=input_text,
        outputs=output_text,
    )

demo.launch()