# llm-chatbot / app.py
import os
import torch
from threading import Thread
from transformers import AutoTokenizer, AutoConfig, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import gradio as gr
from gradio_helper import make_demo
from llm_config import SUPPORTED_LLM_MODELS
from pathlib import Path
# Define model configuration
model_language = "en" # Example: set to English
model_id = "qwen2.5-0.5b-instruct" # Example model ID
# Define model directories
pt_model_id = SUPPORTED_LLM_MODELS[model_language][model_id]["model_id"]
int4_model_dir = Path(model_id) / "INT4_compressed_weights"

# Ensure INT4 weights exist; if not, attempt conversion (locally, before deployment)
def check_and_convert_model():
    if not (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model weights not found. Attempting compression...")
        convert_to_int4()

def convert_to_int4():
    """
    Convert the model to INT4 precision using the `optimum-cli` tool.
    This should only be run locally or in an environment that supports shell commands.
    """
    # Per-model compression parameters
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    model_compression_params = compression_configs.get(model_id, compression_configs["default"])

    # Skip conversion if the INT4 model already exists
    if (int4_model_dir / "openvino_model.xml").exists():
        print("INT4 model already exists.")
        return

    # Build the `optimum-cli` export command; the target directory is passed as the
    # final positional argument
    export_command_base = f"optimum-cli export openvino --model {pt_model_id} --task text-generation-with-past --weight-format int4"
    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    # Other compression arguments (e.g. AWQ) can be appended here
    export_command = export_command_base + int4_compression_args + f" {int4_model_dir}"
    print(f"Running compression command: {export_command}")

    # Execute the export command (typically done locally, not in Hugging Face Spaces;
    # for deployment, the model should be pre-compressed and uploaded)
    os.system(export_command)
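
# For reference, with the parameters above the assembled command looks roughly like the
# following. The Hub id "Qwen/Qwen2.5-0.5B-Instruct" is only an illustrative guess; the
# actual value of pt_model_id comes from SUPPORTED_LLM_MODELS in llm_config.py.
#
#   optimum-cli export openvino --model Qwen/Qwen2.5-0.5B-Instruct \
#       --task text-generation-with-past --weight-format int4 \
#       --group-size 128 --ratio 1.0 --sym qwen2.5-0.5b-instruct/INT4_compressed_weights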

# Check if the INT4 model exists or needs conversion
check_and_convert_model()

# Load the tokenizer from the (now present) INT4 model directory
tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)

# Initialize OpenVINO and load the compressed model
core = ov.Core()
ov_model = OVModelForCausalLM.from_pretrained(
    str(int4_model_dir),
    device="CPU",  # Adjust device as needed (e.g., "GPU" or "CPU")
    config=AutoConfig.from_pretrained(str(int4_model_dir), trust_remote_code=True),
    trust_remote_code=True,
)
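
# Optional tuning: `from_pretrained` in optimum-intel also accepts an `ov_config` dict of
# OpenVINO runtime properties. A minimal sketch (commented out; property values are
# illustrative, not required by this app):
#
#   ov_model = OVModelForCausalLM.from_pretrained(
#       str(int4_model_dir),
#       device="CPU",
#       ov_config={"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": ""},
#       config=AutoConfig.from_pretrained(str(int4_model_dir), trust_remote_code=True),
#       trust_remote_code=True,
#   )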

def convert_history_to_token(history):
    """
    Convert the conversation history into input tokens for the model.

    Note: this simplified version only encodes the latest user message;
    earlier turns are not included in the prompt.
    """
    input_ids = tok.encode(history[-1][0])
    return torch.LongTensor([input_ids])
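
# A fuller variant (illustrative only, not used by bot() below) would run the whole
# history through the tokenizer's chat template, assuming the tokenizer provides one:
def convert_history_to_token_with_template(history):
    """Illustrative multi-turn variant using the tokenizer's chat template."""
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    # Returns a [1, seq_len] tensor of input ids ready for generate()
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")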

def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
    Bot logic: process the conversation history and stream the generated response.
    """
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation in a background thread so tokens can be streamed as they arrive
    t = Thread(target=ov_model.generate, kwargs=generate_kwargs)
    t.start()

    # Stream partial output and update the last history entry as it grows
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history
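
# `bot` is a generator: the Gradio helper iterates it to refresh the chat window.
# Illustrative call (history is a list of [user, assistant] pairs):
#
#   for updated_history in bot([["Hello!", ""]], 0.7, 0.9, 50, 1.1, None):
#       ...  # each iteration carries the partially generated reply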

# Gradio interface setup
demo = make_demo(run_fn=bot, stop_fn=None, title="OpenVINO Chatbot", language="en")
demo.launch(debug=True, share=True)