2-Bit-Quip / app.py
Tonic's picture
Create app.py
dbe816b verified
raw
history blame
2.43 kB
import gradio as gr
from transformers import AutoTokenizer
from quantizer import load_quantized_model
import torch
# Markdown heading rendered at the top of the demo page.
title = """# 🙋🏻‍♂️Welcome to 🌟Tonic's 2-Bit Llama2 on GPU-Zero! """
# Markdown/HTML blurb rendered under the heading.
# NOTE(review): the heading says Llama2, the text below links to
# keyfan/Qwen-72B-Chat-2bit, and the "Duplicate Space" badge targets
# Tonic/StableMed_Chat -- confirm which model/space names are intended.
description = """
this model is a 2 bit quantized model using [QuIP for all](https://github.com/chu-tianxiang/QuIP-for-all/). You can try out [keyfan/Qwen-72B-Chat-2bit](https://huggingface.co/keyfan/Qwen-72B-Chat-2bit) below or try it locally by cloning or duplicating this space. Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/StableMed_Chat?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>
Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community on 👻Discord: [Discord](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Polytonic](https://github.com/tonic-ai) & contribute to 🌟 [PolyGPT](https://github.com/tonic-ai/polygpt-alpha)
"""
# Location of the quantized checkpoint; the tokenizer is loaded from the
# same location further down.
quant_dir = "llama-70b_2bit_quip"
# Build the quantized model, then move it onto the GPU as a separate step.
quant_model = load_quantized_model(quant_dir)
quant_model = quant_model.cuda()
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(quant_dir)
def generate_text(input_text, max_new_tokens=128):
    """Generate a sampled continuation of ``input_text`` with the quantized model.

    Args:
        input_text: Prompt text entered by the user.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt. Passed explicitly so the response length does not
            depend on whatever the model's generation config happens to set.

    Returns:
        The decoded sequence returned by ``generate`` with special tokens
        removed, or an empty string when the prompt is blank. (For
        decoder-only models the returned sequence starts with the prompt.)
    """
    # A blank prompt gives the model nothing to continue; skip the GPU call.
    if not input_text or not input_text.strip():
        return ""
    input_ids = tokenizer.encode(input_text, return_tensors="pt").cuda()
    output_ids = quant_model.generate(
        input_ids,
        do_sample=True,
        max_new_tokens=max_new_tokens,
    )[0]
    # Drop BOS/EOS and similar markers so they do not show up in the UI.
    return tokenizer.decode(output_ids, skip_special_tokens=True)
# Assemble the demo page: heading, description, a prompt box with its
# button, and a display-only box for the generated text.
with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Row():
        input_text = gr.Textbox(label="Enter text here", placeholder="Type something...", lines=2)
        submit_button = gr.Button("Generate")
    # `readonly` is not a Textbox argument; `interactive=False` is the
    # supported way to make the component display-only.
    output_text = gr.Textbox(label="Generated Text", interactive=False)
    # Clicking the button runs generate_text on the prompt and writes the
    # result into the output box.
    submit_button.click(
        fn=generate_text,
        inputs=input_text,
        outputs=output_text,
    )
demo.launch()